Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bd23910
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*.swp
+*.s
+*.o
+*.out
+include/blasfeo_target.h
+libblasfeo.a
+libblasfeo.so
+octave-workspace
+build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..b7cfbf5
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,611 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# BLASFEO is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# BLASFEO is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with BLASFEO; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+cmake_minimum_required(VERSION 2.8.11)
+
+project(blasfeo)
+
+enable_language(C ASM)
+
+# Target architecture
+#set(TARGET X64_INTEL_HASWELL)
+set(TARGET X64_INTEL_SANDY_BRIDGE CACHE STRING "Target architecture")
+#set(TARGET X64_INTEL_CORE)
+#set(TARGET X64_AMD_BULLDOZER)
+#set(TARGET ARMV8A_ARM_CORTEX_A57)
+#set(TARGET ARMV7A_ARM_CORTEX_A15)
+#set(TARGET GENERIC)
+
+# Linear Algebra library
+set(LA HIGH_PERFORMANCE CACHE STRING "Linear algebra optimization level")
+#set(LA REFERENCE)
+#set(LA BLAS)
+
+# BLAS and LAPACK version (for LA=BLAS in BLASFEO)
+set(REF_BLAS 0 CACHE STRING "Reference blas to use")
+#set(REF_BLAS OPENBLAS)
+#set(REF_BLAS NETLIB)
+#set(REF_BLAS MKL)
+#set(REF_BLAS BLIS)
+#set(REF_BLAS ATLAS)
+
+# Compile auxiliary functions with external dependencies (for memory allocation and printing)
+set(EXT_DEP ON CACHE BOOL "Compile external dependencies in BLASFEO")
+
+configure_file(${PROJECT_SOURCE_DIR}/blasfeo_target.h.in
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/blasfeo_target.h @ONLY)
+
+# C Compiler
+# set(CC_COMPILER gcc CACHE STRING "compiler")
+#set(CC_COMPILER clang)
+#set(CC_COMPILER x86_64-w64-mingw32-gcc)
+
+# build shared library
+#set(BUILD_SHARED_LIBS ON CACHE STRING "Build shared libraries")
+
+# installation directory
+if(CMAKE_INSTALL_PREFIX MATCHES "/usr/local")
+ set(CMAKE_INSTALL_PREFIX "/opt/blasfeo")
+endif()
+
+# headers installation directory
+set(BLASFEO_HEADERS_INSTALLATION_DIRECTORY "include" CACHE STRING "Headers local installation directory")
+
+# Macro level (code size vs performance in assembly kernels): 0 (no macro), 1 (all macro but gemm kernel), 2 (all macro)
+set(MACRO_LEVEL 0)
+
+# enable runtime checks
+set(RUNTIME_CHECKS 0)
+#set(RUNTIME_CHECKS 1)
+
+# compiler flags
+if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "")
+ set(CMAKE_ASM_FLAGS "")
+ set(CMAKE_C_FLAGS_RELEASE "")
+ set(CMAKE_ASM_FLAGS_RELEASE "")
+ # optimization flags
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+ # debugging flags
+ #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g")
+ #set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g")
+endif()
+
+# search directories
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${BLASFEO_PATH}/include") XXX
+
+#
+if(${LA} MATCHES HIGH_PERFORMANCE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_HIGH_PERFORMANCE")
+endif()
+if(${LA} MATCHES REFERENCE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_REFERENCE")
+endif()
+if(${LA} MATCHES BLAS)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_BLAS")
+endif()
+
+#
+if(${RUNTIME_CHECKS} MATCHES 1)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDIM_CHECK")
+endif()
+
+#
+if(${EXT_DEP})
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DEXT_DEP")
+endif()
+
+#
+if(${MACRO_LEVEL} MATCHES 1)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMACRO_LEVEL=1")
+endif()
+if(${MACRO_LEVEL} MATCHES 2)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMACRO_LEVEL=2")
+endif()
+
+#
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_LINUX")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_LINUX")
+endif()
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_MAC")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_MAC")
+endif()
+if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_WINDOWS")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_WINDOWS")
+endif()
+
+#
+if(${REF_BLAS} MATCHES 0)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ")
+endif(${REF_BLAS} MATCHES 0)
+if(${REF_BLAS} MATCHES OPENBLAS)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_OPENBLAS -I/opt/openblas/include")
+endif(${REF_BLAS} MATCHES OPENBLAS)
+if(${REF_BLAS} MATCHES BLIS)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_BLIS -std=c99")
+endif(${REF_BLAS} MATCHES BLIS)
+if(${REF_BLAS} MATCHES NETLIB)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_NETLIB")
+endif(${REF_BLAS} MATCHES NETLIB)
+if(${REF_BLAS} MATCHES MKL)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_MKL -m64 -I/opt/intel/mkl/include")
+endif(${REF_BLAS} MATCHES MKL)
+if(${REF_BLAS} MATCHES ATLAS)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_ATLAS")
+endif(${REF_BLAS} MATCHES ATLAS)
+
+# architecture-specific flags
+if(${TARGET} MATCHES X64_INTEL_HASWELL)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_HASWELL")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx2 -mfma") # Haswell supports AVX2 and FMA, mirroring the flags set for the other x64 targets
+ endif()
+endif()
+
+if(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_SANDY_BRIDGE")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx")
+ endif()
+endif()
+
+if(${TARGET} MATCHES X64_INTEL_CORE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_CORE")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse3")
+ endif()
+endif()
+
+if(${TARGET} MATCHES X64_AMD_BULLDOZER)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_AMD_BULLDOZER")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx -mfma")
+ endif()
+endif()
+
+if(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV8A_ARM_CORTEX_A57")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV8A_ARM_CORTEX_A57")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto+fp+simd")
+ endif()
+endif()
+
+if(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV7A_ARM_CORTEX_A15")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV7A_ARM_CORTEX_A15")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15")
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon-vfpv4")
+ endif()
+endif()
+
+if(${TARGET} MATCHES GENERIC)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_GENERIC")
+endif()
+
+
+
+# source files
+
+if(${LA} MATCHES HIGH_PERFORMANCE)
+
+ if(${TARGET} MATCHES X64_INTEL_HASWELL)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/avx2/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib8.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib48.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_12x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_8x8_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_8x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemv_8_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dsymv_6_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgebp_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgelqf_4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_24x4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_16x4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_8x8_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_8x4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_diag_lib8.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgead_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgecp_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgetr_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgesc_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_8_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib8.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib8.c
+ )
+
+ endif(${TARGET} MATCHES X64_INTEL_HASWELL)
+
+ if(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib8.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib48.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_8x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_12_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_8_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dsymv_6_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgebp_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgelqf_4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_16x4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_8x8_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_8x4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_diag_lib8.c
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgead_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgecp_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgetr_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgesc_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_8_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_4_lib8.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib8.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib8.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib8.c
+ )
+
+ endif(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+
+ if(${TARGET} MATCHES X64_INTEL_CORE)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/sse3/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+ )
+
+ endif(${TARGET} MATCHES X64_INTEL_CORE)
+
+ if(${TARGET} MATCHES X64_AMD_BULLDOZER)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/fma/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+ )
+
+ endif(${TARGET} MATCHES X64_AMD_BULLDOZER)
+
+ if(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_dgemm_8x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_16x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_12x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_8x8_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_8x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+ )
+
+ endif(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+
+ if(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_dgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_12x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_8x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_4x4_lib4.S
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+ )
+
+ endif(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+
+ if(${TARGET} MATCHES GENERIC)
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+ )
+
+ file(GLOB KERNEL_SRC
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+ ${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+ )
+
+ endif(${TARGET} MATCHES GENERIC)
+
+else(${LA} MATCHES HIGH_PERFORMANCE) # REFERENCE BLAS
+
+ file(GLOB AUX_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib.c
+ )
+
+ file(GLOB BLAS_SRC
+ ${PROJECT_SOURCE_DIR}/blas/d_blas1_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/d_lapack_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas1_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib.c
+ ${PROJECT_SOURCE_DIR}/blas/s_lapack_lib.c
+ )
+
+endif(${LA} MATCHES HIGH_PERFORMANCE)
+
+if(${EXT_DEP})
+
+ file(GLOB EXT_SRC
+ ${PROJECT_SOURCE_DIR}/auxiliary/d_aux_ext_dep_lib.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/s_aux_ext_dep_lib.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/v_aux_ext_dep_lib.c
+ ${PROJECT_SOURCE_DIR}/auxiliary/i_aux_ext_dep_lib.c
+ )
+
+endif()
+
+set(BLASFEO_SRC ${AUX_SRC} ${KERNEL_SRC} ${BLAS_SRC} ${EXT_SRC})
+
+# add library
+add_library(blasfeo ${BLASFEO_SRC})
+target_include_directories(blasfeo
+ PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+
+install(TARGETS blasfeo EXPORT blasfeoConfig
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib
+ RUNTIME DESTINATION bin)
+
+install(EXPORT blasfeoConfig DESTINATION cmake)
+
+file(GLOB_RECURSE BLASFEO_HEADERS "include/*.h")
+install(FILES ${BLASFEO_HEADERS} DESTINATION ${BLASFEO_HEADERS_INSTALLATION_DIRECTORY})
+
+# test problems
+# add_subdirectory(test_problems)
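The install/export rules above publish the library target through blasfeoConfig under <prefix>/cmake, with /opt/blasfeo as the default prefix and the headers under <prefix>/include. A minimal consumer-side sketch, assuming that default prefix and hypothetical names (blasfeo_consumer, example.c); since the exported target only records the build-tree include path, the installed header directory is added explicitly:

    # Hypothetical downstream CMakeLists.txt (a sketch, not part of BLASFEO).
    cmake_minimum_required(VERSION 2.8.11)
    project(blasfeo_consumer C)

    # Finds /opt/blasfeo/cmake/blasfeoConfig.cmake; configure this project with
    #   -DCMAKE_PREFIX_PATH=/opt/blasfeo
    find_package(blasfeo REQUIRED)

    add_executable(example example.c)
    # The export carries no install-side include path, so point at the installed headers.
    target_include_directories(example PRIVATE /opt/blasfeo/include)
    target_link_libraries(example blasfeo)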
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..5ab7695
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,504 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library. It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the library's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b7a438f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,257 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# BLASFEO is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# BLASFEO is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with BLASFEO; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ./Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/avx/kernel_dgecp_lib4.o ./auxiliary/avx2/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib8.o
+OBJS += ./auxiliary/m_aux_lib48.o
+# kernels
+OBJS += ./kernel/avx2/kernel_dgemm_12x4_lib4.o ./kernel/avx2/kernel_dgemm_8x8_lib4.o ./kernel/avx2/kernel_dgemm_8x4_lib4.o ./kernel/avx2/kernel_dgemm_4x4_lib4.o ./kernel/avx/kernel_dgemm_diag_lib4.o ./kernel/avx2/kernel_dgemv_8_lib4.o ./kernel/avx/kernel_dgemv_4_lib4.o ./kernel/avx2/kernel_dsymv_6_lib4.o ./kernel/avx2/kernel_dgetrf_pivot_4_lib4.o ./kernel/avx/kernel_dgeqrf_4_lib4.o kernel/avx2/kernel_dgebp_lib4.o kernel/avx2/kernel_dgelqf_4_lib4.o
+OBJS += ./kernel/avx2/kernel_sgemm_24x4_lib8.o ./kernel/avx2/kernel_sgemm_16x4_lib8.o ./kernel/avx2/kernel_sgemm_8x8_lib8.o ./kernel/avx2/kernel_sgemm_8x4_lib8.o ./kernel/avx/kernel_sgemm_diag_lib8.o ./kernel/avx/kernel_sgecp_lib8.o ./kernel/avx/kernel_sgetr_lib8.o ./kernel/avx/kernel_sgead_lib8.o ./kernel/avx/kernel_sgesc_lib8.o ./kernel/avx/kernel_sgemv_8_lib8.o ./kernel/avx/kernel_sgemv_4_lib8.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib8.o ./blas/s_blas2_lib8.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib8.o ./blas/s_blas3_diag_lib8.o ./blas/s_lapack_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/avx/kernel_dgecp_lib4.o ./auxiliary/avx/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib8.o
+OBJS += ./auxiliary/m_aux_lib48.o
+# kernels
+OBJS += ./kernel/avx/kernel_dgemm_8x4_lib4.o ./kernel/avx/kernel_dgemm_4x4_lib4.o ./kernel/avx/kernel_dgemm_diag_lib4.o ./kernel/avx/kernel_dgemv_12_lib4.o ./kernel/avx/kernel_dgemv_8_lib4.o ./kernel/avx/kernel_dgemv_4_lib4.o ./kernel/avx/kernel_dsymv_6_lib4.o ./kernel/avx/kernel_dgetrf_pivot_4_lib4.o ./kernel/avx/kernel_dgeqrf_4_lib4.o kernel/avx/kernel_dgebp_lib4.o
+OBJS += ./kernel/avx/kernel_sgemm_16x4_lib8.o ./kernel/avx/kernel_sgemm_8x8_lib8.o ./kernel/avx/kernel_sgemm_8x4_lib8.o ./kernel/avx/kernel_sgecp_lib8.o ./kernel/avx/kernel_sgemm_diag_lib8.o ./kernel/avx/kernel_sgetr_lib8.o ./kernel/avx/kernel_sgead_lib8.o ./kernel/avx/kernel_sgesc_lib8.o ./kernel/avx/kernel_sgemv_8_lib8.o ./kernel/avx/kernel_sgemv_4_lib8.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib8.o ./blas/s_blas2_lib8.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib8.o ./blas/s_blas3_diag_lib8.o ./blas/s_lapack_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/sse3/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/fma/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/armv8a/kernel_dgemm_8x4_lib4.o ./kernel/armv8a/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/armv8a/kernel_sgemm_16x4_lib4.o ./kernel/armv8a/kernel_sgemm_12x4_lib4.o ./kernel/armv8a/kernel_sgemm_8x8_lib4.o ./kernel/armv8a/kernel_sgemm_8x4_lib4.o ./kernel/armv8a/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/armv7a/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/armv7a/kernel_sgemm_12x4_lib4.o ./kernel/armv7a/kernel_sgemm_8x4_lib4.o ./kernel/armv7a/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+# aux
+OBJS += ./auxiliary/d_aux_lib.o
+OBJS += ./auxiliary/s_aux_lib.o
+OBJS += ./auxiliary/m_aux_lib.o
+# blas
+OBJS += ./blas/d_blas1_lib.o ./blas/d_blas2_lib.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib.o ./blas/d_blas3_diag_lib.o ./blas/d_lapack_lib.o
+OBJS += ./blas/s_blas1_lib.o ./blas/s_blas2_lib.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib.o ./blas/s_blas3_diag_lib.o ./blas/s_lapack_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+# ext dep
+OBJS += ./auxiliary/d_aux_ext_dep_lib.o
+OBJS += ./auxiliary/s_aux_ext_dep_lib.o
+OBJS += ./auxiliary/v_aux_ext_dep_lib.o
+OBJS += ./auxiliary/i_aux_ext_dep_lib.o
+endif
+
+
+
+all: clean static_library
+
+static_library: target
+ ( cd auxiliary; $(MAKE) obj)
+ ( cd kernel; $(MAKE) obj)
+ ( cd blas; $(MAKE) obj)
+ ar rcs libblasfeo.a $(OBJS)
+ cp libblasfeo.a ./lib/
+ @echo
+ @echo " libblasfeo.a static library build complete."
+ @echo
+
+shared_library: target
+ ( cd auxiliary; $(MAKE) obj)
+ ( cd kernel; $(MAKE) obj)
+ ( cd blas; $(MAKE) obj)
+ gcc -shared -o libblasfeo.so $(OBJS)
+ cp libblasfeo.so ./lib/
+ @echo
+ @echo " libblasfeo.so shared library build complete."
+ @echo
+
+target:
+ touch ./include/blasfeo_target.h
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+ echo "#ifndef TARGET_X64_INTEL_HASWELL" > ./include/blasfeo_target.h
+ echo "#define TARGET_X64_INTEL_HASWELL" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+ echo "#ifndef TARGET_X64_INTEL_SANDY_BRIDGE" > ./include/blasfeo_target.h
+ echo "#define TARGET_X64_INTEL_SANDY_BRIDGE" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+ echo "#ifndef TARGET_X64_INTEL_CORE" > ./include/blasfeo_target.h
+ echo "#define TARGET_X64_INTEL_CORE" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+ echo "#ifndef TARGET_X64_AMD_BULLDOZER" > ./include/blasfeo_target.h
+ echo "#define TARGET_X64_AMD_BULLDOZER" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), GENERIC)
+ echo "#ifndef TARGET_GENERIC" > ./include/blasfeo_target.h
+ echo "#define TARGET_GENERIC" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+ echo "#ifndef TARGET_ARMV7A_ARM_CORTEX_A15" > ./include/blasfeo_target.h
+ echo "#define TARGET_ARMV7A_ARM_CORTEX_A15" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), HIGH_PERFORMANCE)
+ echo "#ifndef LA_HIGH_PERFORMANCE" >> ./include/blasfeo_target.h
+ echo "#define LA_HIGH_PERFORMANCE" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), BLAS)
+ echo "#ifndef LA_BLAS" >> ./include/blasfeo_target.h
+ echo "#define LA_BLAS" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), REFERENCE)
+ echo "#ifndef LA_REFERENCE" >> ./include/blasfeo_target.h
+ echo "#define LA_REFERENCE" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(EXT_DEP), 1)
+ echo "#ifndef EXT_DEP" >> ./include/blasfeo_target.h
+ echo "#define EXT_DEP" >> ./include/blasfeo_target.h
+ echo "#endif" >> ./include/blasfeo_target.h
+endif
+
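For illustration, with TARGET=X64_INTEL_SANDY_BRIDGE, LA=HIGH_PERFORMANCE and EXT_DEP=1, the target rule above writes an include/blasfeo_target.h equivalent to the following sketch (reconstructed from the echo commands in the rule, not a file taken from the commit):

	#ifndef TARGET_X64_INTEL_SANDY_BRIDGE
	#define TARGET_X64_INTEL_SANDY_BRIDGE
	#endif
	#ifndef LA_HIGH_PERFORMANCE
	#define LA_HIGH_PERFORMANCE
	#endif
	#ifndef EXT_DEP
	#define EXT_DEP
	#endif
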
+install_static:
+ mkdir -p $(PREFIX)/blasfeo
+ mkdir -p $(PREFIX)/blasfeo/lib
+ cp -f libblasfeo.a $(PREFIX)/blasfeo/lib/
+ mkdir -p $(PREFIX)/blasfeo/include
+ cp -f ./include/*.h $(PREFIX)/blasfeo/include/
+
+install_shared:
+ mkdir -p $(PREFIX)/blasfeo
+ mkdir -p $(PREFIX)/blasfeo/lib
+ cp -f libblasfeo.so $(PREFIX)/blasfeo/lib/
+ mkdir -p $(PREFIX)/blasfeo/include
+ cp -f ./include/*.h $(PREFIX)/blasfeo/include/
+
+test_problem:
+ cp libblasfeo.a ./test_problems/libblasfeo.a
+ make -C test_problems obj
+ @echo
+ @echo " Test problem build complete."
+ @echo
+
+run:
+ ./test_problems/test.out
+
+clean:
+ rm -f libblasfeo.a
+ rm -f libblasfeo.so
+ rm -f ./lib/libblasfeo.a
+ rm -f ./lib/libblasfeo.so
+ make -C auxiliary clean
+ make -C kernel clean
+ make -C blas clean
+ make -C test_problems clean
+ make -C examples clean
+
diff --git a/Makefile.rule b/Makefile.rule
new file mode 100644
index 0000000..200721e
--- /dev/null
+++ b/Makefile.rule
@@ -0,0 +1,183 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+# Target architecture
+# X64_INTEL_HASWELL : x86_64 architecture with AVX2 and FMA ISA (64 bit OS) code optimized for Intel Haswell and Intel Skylake architectures.
+# X64_INTEL_SANDY_BRIDGE : x86_64 architecture with AVX ISA (64 bit OS) code optimized for Intel Sandy-Bridge architecture.
+# X64_INTEL_CORE : x86_64 architecture with SSE3 (64 bit OS) code optimized for Intel Core architecture.
+# X64_AMD_BULLDOZER : x86_64 architecture with AVX and FMA ISA (64 bit OS) code optimized for AMD Bulldozer.
+# ARMV7A_ARM_CORTEX_A15 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) code optimized for ARM Cortex A15.
+# GENERIC : generic c99 code
+TARGET = X64_INTEL_HASWELL
+#TARGET = X64_INTEL_SANDY_BRIDGE
+#TARGET = X64_INTEL_CORE
+#TARGET = X64_AMD_BULLDOZER
+#TARGET = ARMV8A_ARM_CORTEX_A57
+#TARGET = ARMV7A_ARM_CORTEX_A15
+#TARGET = GENERIC
+
+# Linear Algebra library
+LA = HIGH_PERFORMANCE
+#LA = REFERENCE
+#LA = BLAS
+
+# BLAS and LAPACK version (for LA=BLAS)
+REF_BLAS = 0
+#REF_BLAS = OPENBLAS
+#REF_BLAS = NETLIB
+#REF_BLAS = MKL
+#REF_BLAS = BLIS
+#REF_BLAS = ATLAS
+
+# Compile auxiliary functions with external dependencies (for memory allocation and printing)
+#EXT_DEP = 0
+EXT_DEP = 1
+
+# Enable run-time checks for matrix and vector dimensions
+RUNTIME_CHECKS = 0
+#RUNTIME_CHECKS = 1
+
+# Operating system
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Linux)
+ OS = LINUX
+endif
+ifeq ($(UNAME_S),Darwin)
+ OS = MAC
+endif
+#OS = LINUX
+#OS = MAC
+#OS = WINDOWS
+
+# C Compiler
+CC = gcc
+#CC = clang
+#CC = x86_64-w64-mingw32-gcc
+
+# Installation directory
+PREFIX = /opt
+
+# Macro level (code size vs performance in assembly kernels): 0 (no macro), 1 (all macro but gemm kernel), 2 (all macro)
+MACRO_LEVEL = 0
+
+# compiler / assembler / linker flags
+CFLAGS =
+ASFLAGS =
+LDFLAGS =
+
+# Optimization flags
+CFLAGS += -O2 -fPIC
+
+# Debugging flags
+#CFLAGS += -g #-Wall -pedantic -Wfloat-equal #-pg
+#ASFLAGS += -g
+
+# Definitions
+ifeq ($(LA), HIGH_PERFORMANCE)
+CFLAGS += -DLA_HIGH_PERFORMANCE
+endif
+ifeq ($(LA), REFERENCE)
+CFLAGS += -DLA_REFERENCE
+endif
+ifeq ($(LA), BLAS)
+CFLAGS += -DLA_BLAS
+endif
+
+ifeq ($(RUNTIME_CHECKS), 1)
+CFLAGS += -DDIM_CHECK
+endif
+
+ifeq ($(EXT_DEP), 1)
+CFLAGS += -DEXT_DEP
+endif
+
+ifeq ($(MACRO_LEVEL), 1)
+ASFLAGS += -DMACRO_LEVEL=1
+endif
+ifeq ($(MACRO_LEVEL), 2)
+ASFLAGS += -DMACRO_LEVEL=2
+endif
+
+ifeq ($(OS), LINUX)
+CFLAGS += -DOS_LINUX
+ASFLAGS += -DOS_LINUX
+endif
+ifeq ($(OS), MAC)
+CFLAGS += -DOS_MAC
+ASFLAGS += -DOS_MAC
+endif
+ifeq ($(OS), WINDOWS)
+CFLAGS += -DOS_WINDOWS
+ASFLAGS += -DOS_WINDOWS
+endif
+
+ifeq ($(REF_BLAS), 0)
+CFLAGS +=
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+CFLAGS += -DREF_BLAS_OPENBLAS -I/opt/openblas/include
+endif
+ifeq ($(REF_BLAS), BLIS)
+CFLAGS += -DREF_BLAS_BLIS -std=c99
+endif
+ifeq ($(REF_BLAS), NETLIB)
+CFLAGS += -DREF_BLAS_NETLIB
+endif
+ifeq ($(REF_BLAS), MKL)
+CFLAGS += -DREF_BLAS_MKL -m64 -I/opt/intel/mkl/include
+endif
+ifeq ($(REF_BLAS), ATLAS)
+CFLAGS += -DREF_BLAS_ATLAS
+endif
+
+# Architecture-specific flags
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+CFLAGS += -m64 -mavx2 -mfma -DTARGET_X64_INTEL_HASWELL
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+CFLAGS += -m64 -mavx -DTARGET_X64_INTEL_SANDY_BRIDGE
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+CFLAGS += -m64 -msse3 -DTARGET_X64_INTEL_CORE
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+CFLAGS += -m64 -mavx -mfma -DTARGET_X64_AMD_BULLDOZER
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+CFLAGS += -march=armv8-a+crc+crypto+fp+simd -DTARGET_ARMV8A_ARM_CORTEX_A57
+ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A57
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+CFLAGS += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15 -DTARGET_ARMV7A_ARM_CORTEX_A15
+ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A15
+endif
+ifeq ($(TARGET), GENERIC)
+CFLAGS += -DTARGET_GENERIC
+endif
+
+
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..685a2c8
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,25 @@
+BLASFEO - BLAS For Embedded Optimization
+
+BLASFEO provides a set of linear algebra routines optimized for use in embedded optimization.
+It is for example employed in the Model Predictive Control software package HPMPC.
+
+BLASFEO provides three implementations of each linear algebra routine (LA):
+- HIGH_PERFORMANCE: a high-performance implementation hand-optimized for different computer architectures.
+- REFERENCE: a lightly-optimized version, coded entirely in C without assumptions about the computer architecture.
+- BLAS: a wrapper to BLAS and LAPACK routines.
+
+The currently supported computer architectures (TARGET) are:
+- X64_INTEL_HASWELL: Intel Haswell architecture or newer, AVX2 and FMA ISA, 64-bit OS.
+- X64_INTEL_SANDY_BRIDGE: Intel Sandy-Bridge architecture or newer, AVX ISA, 64-bit OS.
+- X64_INTEL_CORE: Intel Core architecture or newer, SSE3 ISA, 64-bit OS.
+- X64_AMD_BULLDOZER: AMD Bulldozer architecture, AVX and FMA ISAs, 64-bit OS.
+- ARMV8A_ARM_CORTEX_A57: ARMv8A architecture, VFPv4 and NEONv2 ISAs, 64-bit OS.
+- ARMV7A_ARM_CORTEX_A15: ARMv7A architecture, VFPv3 and NEON ISAs, 32-bit OS.
+- GENERIC: generic target, coded in C, giving better performance if the architecture provides more than 16 scalar FP registers (e.g. many RISC architectures such as ARM).
+
+The optimized linear algebra kernels are currently provided for OS_LINUX (x86_64 64-bit, ARMv8A 64-bit, ARMv7A 32-bit), OS_WINDOWS (x86_64 64-bit) and OS_MAC (x86_64 64-bit).
+
+BLASFEO employs structures to describe matrices (d_strmat) and vectors (d_strvec), defined in include/blasfeo_common.h.
+The actual implementation of d_strmat and d_strvec depends on the LA and TARGET choice.
+
+More information about BLASFEO can be found in the arXiv paper at https://arxiv.org/abs/1704.02457
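A minimal usage sketch of these structures is shown below, assuming the library is built with EXT_DEP=1 and linked as libblasfeo.a. The helper names used here (d_allocate_strmat, d_print_strmat, d_free_strmat) and the header name blasfeo_d_aux_ext_dep.h are assumptions based on the auxiliary ext_dep sources in this tree; check the headers under include/ for the exact API.

	// Minimal sketch, not taken verbatim from the library documentation.
	// The allocation/printing helpers and the aux header name are assumed; see include/.
	#include <stdio.h>

	#include "blasfeo_target.h"
	#include "blasfeo_common.h"
	#include "blasfeo_d_aux_ext_dep.h"   // assumed EXT_DEP auxiliary header

	int main()
		{
		struct d_strmat sA;
		d_allocate_strmat(4, 4, &sA);      // assumed allocator (requires EXT_DEP=1)
		// ... fill sA and call linear algebra routines on it ...
		d_print_strmat(4, 4, &sA, 0, 0);   // assumed printing helper
		d_free_strmat(&sA);
		return 0;
		}
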
diff --git a/TODOlist.txt b/TODOlist.txt
new file mode 100644
index 0000000..bba5ee0
--- /dev/null
+++ b/TODOlist.txt
@@ -0,0 +1,7 @@
+- syrk_potrf_ln_mn
+- alpha for trsm
+- kernels and _mn_ version of trmv
+- kernel dsymv dgemv_nt 4 avx
+- remove n from trmv
+- store_gen in single precision
+- clean target.h and create it also from cmake (see "file")
diff --git a/auxiliary/Makefile b/auxiliary/Makefile
new file mode 100644
index 0000000..d1242bd
--- /dev/null
+++ b/auxiliary/Makefile
@@ -0,0 +1,124 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_aux_lib.o
+OBJS += s_aux_lib.o
+OBJS += m_aux_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+#ext dep
+OBJS += d_aux_ext_dep_lib.o
+OBJS += s_aux_ext_dep_lib.o
+OBJS += v_aux_ext_dep_lib.o
+OBJS += i_aux_ext_dep_lib.o
+endif
+
+obj: $(OBJS)
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+ ( cd avx2; $(MAKE) obj)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), GENERIC)
+ ( cd c99; $(MAKE) obj)
+endif
+
+
+clean:
+ rm -f *.o
+ make -C avx2 clean
+ make -C avx clean
+ make -C c99 clean
diff --git a/auxiliary/avx/Makefile b/auxiliary/avx/Makefile
new file mode 100644
index 0000000..84e0154
--- /dev/null
+++ b/auxiliary/avx/Makefile
@@ -0,0 +1,50 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/avx/kernel_dgecp_lib4.c b/auxiliary/avx/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..4bc8c9a
--- /dev/null
+++ b/auxiliary/avx/kernel_dgecp_lib4.c
@@ -0,0 +1,3024 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*3], a_0 );
+
+ A0 += 16;
+ B0 += 16;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*3], a_0 );
+
+ A1 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ A0 += 4;
+ B0 += 4;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ A1 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_sd( &A0[1+0*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[1+0*bs], c_0 );
+ c_0 = _mm_load_pd( &A0[2+0*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+0*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+0*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+0*bs], a_0 );
+
+ c_0 = _mm_load_pd( &A0[2+1*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+1*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+1*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+1*bs], a_0 );
+
+ c_0 = _mm_load_sd( &A0[3+2*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+2*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+2*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+2*bs], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+3*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+3*bs], a_0 );
+
+ c_0 = _mm_load_sd( &A1[1+4*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[1+4*bs], c_0 );
+ c_0 = _mm_load_pd( &A1[2+4*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+4*bs], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+5*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+5*bs], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+6*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+6*bs], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_pd( &A0[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A0[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*2], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*3], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*3], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*4], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[2+bs*5], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[0+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_sd( &A0[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*2], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*3], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*4], c_0 );
+
+ c_0 = _mm_load_pd( &A2[0+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*5], c_0 );
+
+ c_0 = _mm_load_sd( &A2[1+bs*6] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*2], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*3], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*3], c_0 );
+
+ c_0 = _mm_load_pd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*4], c_0 );
+
+ c_0 = _mm_load_sd( &A2[1+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[2+bs*5], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*5], c_0 );
+
+ c_0 = _mm_load_sd( &A2[2+bs*6] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m256d
+ alpha_0,
+ a_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_sd( &A[1+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A[3+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
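To make the panel-major (bs=4) layout used by kernel_dgecp_4_0_lib4 above concrete, here is a small standalone test sketch; the harness, sizes and values are illustrative assumptions. Element (i,j) of a 4 x 6 panel lives at index i+4*j, and both buffers are 32-byte aligned as the kernel's aligned loads and stores require. Compile with AVX enabled (e.g. -mavx) and link against the object containing the kernel.

	#include <stdio.h>

	// prototype of the kernel defined in this file
	void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B);

	int main()
		{
		_Alignas(32) double A[4*6];   // C11 alignment: 32 bytes, as needed by the 256-bit loads
		_Alignas(32) double B[4*6];
		int i, j;
		for(j=0; j<6; j++)
			for(i=0; i<4; i++)
				A[i+4*j] = i + 4*j;
		// tri=0: plain copy of the full 4 x 6 panel, scaled by alpha=2.0
		kernel_dgecp_4_0_lib4(0, 6, 2.0, A, B);
		for(j=0; j<6; j++)
			for(i=0; i<4; i++)
				printf("B(%d,%d) = %f\n", i, j, B[i+4*j]);
		return 0;
		}
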
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_pd( &A0[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A0[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_sd( &A0[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+ }
+
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_1 = _mm_load_sd( &A[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_1 = _mm_load_sd( &A[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_1 = _mm_load_sd( &A[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_loadu_pd( &A[1+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*1] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*2] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*3] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+ a_0 = _mm_load_sd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A1[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*1], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*2], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A1[1+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ a_0 = _mm_load_sd( &A[1+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ a_0 = _mm_load_sd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
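+// Note on layout and naming (added for exposition; not part of the original BLASFEO
+// sources): the kernels in this file work on the panel-major "lib4" format, where the
+// matrix is stored in panels of bs=4 rows and element (r,c) of a panel sits at index
+// r+bs*c.  The suffix _m_n in kernel_dgecp_m_n_lib4 encodes the number of rows handled
+// (m) and the row offset of A inside its panel (n); for n>0 the data is gathered from
+// consecutive panels A0, A1 (and A2 for the 8-wide kernels).  A plain-C sketch with a
+// hypothetical name, equivalent to the aligned 1-wide kernel above for tri==0:
+static inline void example_dgecp_1_0_scalar(int kmax, double alpha, double *A, double *B)
+	{
+	const int bs = 4;
+	int k;
+	for(k=0; k<kmax; k++)
+		{
+		// copy and scale one element per 4-tall column of the panel
+		B[0+bs*k] = alpha * A[0+bs*k];
+		}
+	}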
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*3], a_0 );
+
+ A0 += 16;
+ B0 += 16;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*1] );
+ c_0 = _mm256_load_pd( &B1[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*2] );
+ c_0 = _mm256_load_pd( &B1[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*3] );
+ c_0 = _mm256_load_pd( &B1[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*3], a_0 );
+
+ A1 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ A0 += 4;
+ B0 += 4;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ A1 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
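+// The kernel_dgead_* routines (the one above and those that follow) mirror the
+// kernel_dgecp_* copy kernels, but accumulate into B instead of overwriting it,
+// i.e. they compute B <- B + alpha*A on a panel-major block.  A minimal scalar
+// sketch of the aligned 8-wide case (illustrative only, hypothetical name, not
+// part of the BLASFEO API):
+static inline void example_dgead_8_0_scalar(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+	const int bs = 4;
+	double *A1 = A0 + bs*sda;	// second 4-row panel of A
+	double *B1 = B0 + bs*sdb;	// second 4-row panel of B
+	int k, i;
+	for(k=0; k<kmax; k++)
+		{
+		for(i=0; i<4; i++)
+			{
+			B0[i+bs*k] += alpha * A0[i+bs*k];
+			B1[i+bs*k] += alpha * A1[i+bs*k];
+			}
+		}
+	}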
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
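+// In the misaligned 8-wide variants (1, 2 or 3 elements of A skipped) each 4-tall
+// output column is reassembled from two adjacent source panels (A0/A1 for the first
+// output panel, A1/A2 for the second), using _mm256_permute2f128_pd and, for the odd
+// offsets, _mm256_shuffle_pd.  In scalar terms, the 1-element-skip kernel above
+// computes the following (sketch with a hypothetical name, for exposition only):
+static inline void example_dgead_8_1_scalar(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+	const int bs = 4;
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+	int k, i;
+	for(k=0; k<kmax; k++)
+		{
+		for(i=0; i<3; i++)
+			B0[i+bs*k] += alpha * A0[i+1+bs*k];	// rows 1..3 of the first panel of A
+		B0[3+bs*k] += alpha * A1[0+bs*k];	// row 0 of the second panel of A
+		for(i=0; i<3; i++)
+			B1[i+bs*k] += alpha * A1[i+1+bs*k];
+		B1[3+bs*k] += alpha * A2[0+bs*k];	// row 0 of the third panel of A
+		}
+	}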
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m256d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*1] );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*2] );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*3] );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_1 = _mm_load_sd( &A[2+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ c_1 = _mm_load_sd( &B[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_1 = _mm_load_sd( &A[2+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ c_1 = _mm_load_sd( &B[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_1 = _mm_load_sd( &A[2+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ c_1 = _mm_load_sd( &B[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+ a_1 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ c_1 = _mm_load_sd( &B[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+ a_1 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ c_1 = _mm_load_sd( &B[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+ a_1 = _mm_load_sd( &A1[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ c_1 = _mm_load_sd( &B[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ c_1 = _mm_loadu_pd( &B[1+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+ c_0 = _mm_load_sd( &B[0+bs*1] );
+ c_1 = _mm_loadu_pd( &B[1+bs*1] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+ _mm_storeu_pd( &B[1+bs*1], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+ c_0 = _mm_load_sd( &B[0+bs*2] );
+ c_1 = _mm_loadu_pd( &B[1+bs*2] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+ _mm_storeu_pd( &B[1+bs*2], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+ c_0 = _mm_load_sd( &B[0+bs*3] );
+ c_1 = _mm_loadu_pd( &B[1+bs*3] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+ _mm_storeu_pd( &B[1+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ c_1 = _mm_loadu_pd( &B[1+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_load_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*1] );
+ c_0 = _mm_load_sd( &B[0+bs*1] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*2] );
+ c_0 = _mm_load_sd( &B[0+bs*2] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*3] );
+ c_0 = _mm_load_sd( &B[0+bs*3] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// set kmax columns of a 4-row panel of A to alpha
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A)
+ {
+
+ int k;
+
+ __m256d
+ a0;
+
+ a0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+ _mm256_store_pd( &A[4], a0 );
+ _mm256_store_pd( &A[8], a0 );
+ _mm256_store_pd( &A[12], a0 );
+
+ A += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+
+ A += 4;
+
+ }
+
+ }
+
+
+// A lower triangular: set kmax full columns of the 4-row panel to alpha, then the trailing 4x4 lower triangle
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A)
+ {
+
+ int k;
+
+ __m256d
+ a0;
+
+ a0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+ _mm256_store_pd( &A[4], a0 );
+ _mm256_store_pd( &A[8], a0 );
+ _mm256_store_pd( &A[12], a0 );
+
+ A += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+
+ A += 4;
+
+ }
+
+ // final 4x4 triangle
+ _mm256_store_pd( &A[0], a0 );
+
+ _mm_store_sd( &A[5], _mm256_castpd256_pd128( a0 ) );
+ _mm_store_pd( &A[6], _mm256_castpd256_pd128( a0 ) );
+
+ _mm_store_pd( &A[10], _mm256_castpd256_pd128( a0 ) );
+
+ _mm_store_sd( &A[15], _mm256_castpd256_pd128( a0 ) );
+
+ }
+
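+// Scalar view of kernel_dtrset_4_lib4 above (sketch only, hypothetical name): the main
+// loops set kmax full 4-row columns to alpha, then the trailing 4x4 lower triangle
+// (diagonal included) is set as well.
+static inline void example_dtrset_4_scalar(int kmax, double alpha, double *A)
+	{
+	const int bs = 4;
+	int k, i;
+	for(k=0; k<kmax; k++)
+		for(i=0; i<4; i++)
+			A[i+bs*k] = alpha;	// rectangular part
+	for(k=0; k<4; k++)
+		for(i=k; i<4; i++)
+			A[i+bs*(kmax+k)] = alpha;	// trailing lower triangle
+	}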
+
+
diff --git a/auxiliary/avx/kernel_dgetr_lib4.c b/auxiliary/avx/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..29d095b
--- /dev/null
+++ b/auxiliary/avx/kernel_dgetr_lib4.c
@@ -0,0 +1,490 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-7; k+=8)
+ {
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
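+// Worked example of the kna update in the tri==1 epilogue above (exposition only).
+// After the main loops, ((bs-kna)+kmax)%bs is the row offset of the next write inside
+// the current panel of C, so kna = (bs-(bs-kna+kmax)%bs)%bs is the number of rows left
+// before the next panel boundary (for the argument ranges this kernel is used with).
+// E.g. with bs=4, kna=1 and kmax=6 (after the kmax+=1 above): (4-1+6)%4 = 1 and
+// kna = (4-1)%4 = 3, so the whole 3x3 triangle fits in the current panel of C and the
+// final else branch (kna==3) is taken.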
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ __m256d
+ v0, v1, v2, v3, v4, v5, v6, v7;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_load_pd( &A[0+ps*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+ps*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+ps*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+ps*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &B[0+ps*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &B[0+ps*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &B[0+ps*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &B[0+ps*3], v3 );
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+		// transpose one remaining row of A into a column of B
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
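+
+// Scalar view of the across-panel transpose above (sketch only, hypothetical name):
+// row k of the source (row k%ps of panel k/ps of A) becomes column k of the single
+// destination panel B.
+static inline void example_dgetr_4_0_scalar(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	int k, i;
+	for(k=0; k<kmax; k++)
+		{
+		for(i=0; i<4; i++)
+			B[i+ps*k] = A[k%ps + ps*i + (k/ps)*ps*sda];
+		}
+	}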
+
diff --git a/auxiliary/avx2/Makefile b/auxiliary/avx2/Makefile
new file mode 100644
index 0000000..463ebf5
--- /dev/null
+++ b/auxiliary/avx2/Makefile
@@ -0,0 +1,46 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/avx2/kernel_dgetr_lib4.c b/auxiliary/avx2/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..14d00ef
--- /dev/null
+++ b/auxiliary/avx2/kernel_dgetr_lib4.c
@@ -0,0 +1,756 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+
+// TODO: handle the tri==1 (triangular) case
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
+ {
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, va, vb, vc, vd, ve, vf;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A0[0+bs*0];
+ C[0+bs*1] = alpha * A0[1+bs*0];
+ C[0+bs*2] = alpha * A0[2+bs*0];
+ C[0+bs*3] = alpha * A0[3+bs*0];
+
+ C[0+bs*4] = alpha * A1[0+bs*0];
+ C[0+bs*5] = alpha * A1[1+bs*0];
+ C[0+bs*6] = alpha * A1[2+bs*0];
+ C[0+bs*7] = alpha * A1[3+bs*0];
+
+ C += 1;
+ A0 += bs;
+ A1 += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for(; k<kmax-7; k+=8)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ }
+
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A0[0+bs*0];
+ C[0+bs*1] = alpha * A0[1+bs*0];
+ C[0+bs*2] = alpha * A0[2+bs*0];
+ C[0+bs*3] = alpha * A0[3+bs*0];
+
+ C[0+bs*4] = alpha * A1[0+bs*0];
+ C[0+bs*5] = alpha * A1[1+bs*0];
+ C[0+bs*6] = alpha * A1[2+bs*0];
+ C[0+bs*7] = alpha * A1[3+bs*0];
+
+ C += 1;
+ A0 += bs;
+ A1 += bs;
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-7; k+=8)
+ {
+
+#if 1
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+#else // TODO alpha
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+#endif
+
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+
+#if 1
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+#else
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+#endif
+
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
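+
+/*
+ * Editorial note (not part of the original sources): the vector path above
+ * pairs the top 128-bit halves of columns 0/2 and 1/3 (and the bottom halves
+ * likewise) in single ymm registers via _mm256_insertf128_pd, so that one
+ * _mm256_unpacklo_pd/_mm256_unpackhi_pd per output column completes the 4x4
+ * transpose; the permute2f128-based variant kept under the #else branch does
+ * not yet apply alpha (hence the "TODO alpha"). A plain-C sketch of one 4x4
+ * tile (hypothetical helper name, kept disabled) is given below.
+ */
+#if 0
+// reference sketch only, not part of BLASFEO: C = alpha * A^T for one 4x4
+// column-major tile with leading dimension bs=4
+static void dgetr_4x4_ref(double alpha, const double *A, double *C)
+	{
+	const int bs = 4;
+	int i, j;
+	for(j=0; j<4; j++)
+		for(i=0; i<4; i++)
+			C[i+bs*j] = alpha * A[j+bs*i];
+	}
+#endif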
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ __m256d
+ v0, v1, v2, v3, v4, v5, v6, v7;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ _mm256_store_pd( &B[0+ps*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ _mm256_store_pd( &B[0+ps*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ _mm256_store_pd( &B[0+ps*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ _mm256_store_pd( &B[0+ps*3], v7 );
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
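+
+/*
+ * Editorial note: kernel_dgetr_4_lib4 above reads A along its panels and
+ * writes C across panels (handling a non-zero row offset via kna), while
+ * kernel_dgetr_4_0_lib4 reads A across panels and writes B along one panel,
+ * assuming no row offset. A hedged driver sketch (hypothetical helper name;
+ * assumes m and n are multiples of 4 and both matrices start at a panel
+ * boundary; the actual drivers in the library may differ) is kept disabled
+ * below.
+ */
+#if 0
+// illustrative only: transpose columns j..j+3 of A into rows j..j+3
+// (panel j/ps) of B, for all column groups of A
+static void dgetr_4_0_driver_sketch(int m, int n, double *pA, int sda, double *pB, int sdb)
+	{
+	const int ps = 4;
+	int j;
+	for(j=0; j<n; j+=4)
+		kernel_dgetr_4_0_lib4(m, pA+j*ps, sda, pB+j/ps*ps*sdb);
+	return;
+	}
+#endif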
+
diff --git a/auxiliary/c99/Makefile b/auxiliary/c99/Makefile
new file mode 100644
index 0000000..6e9ea7b
--- /dev/null
+++ b/auxiliary/c99/Makefile
@@ -0,0 +1,77 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
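+# The objects compiled here are selected by the TARGET and LA variables set in
+# Makefile.rule. Hedged usage example (assumption: the variables are not
+# already pinned by the including build, so command-line overrides apply):
+#   make obj TARGET=GENERIC LA=HIGH_PERFORMANCE
+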
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS +=
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS +=
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/c99/kernel_dgecp_lib4.c b/auxiliary/c99/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..e883072
--- /dev/null
+++ b/auxiliary/c99/kernel_dgecp_lib4.c
@@ -0,0 +1,1261 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
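+/*
+ * Editorial note on the panel-major ("lib4") layout assumed by these kernels:
+ * matrices are stored in horizontal panels of bs=4 rows, so element (i,j) of
+ * a matrix that starts at a panel boundary and has panel stride sda lives at
+ * pA[(i/bs)*bs*sda + i%bs + j*bs], consistent with the indexing used in
+ * d_print_strmat elsewhere in the library. A minimal accessor sketch
+ * (hypothetical helper, not a library function) is kept disabled below.
+ */
+#if 0
+// illustrative only: panel index i/bs, row i%bs inside the panel, column j
+static double d_el_lib4(const double *pA, int sda, int i, int j)
+	{
+	const int bs = 4;
+	return pA[(i/bs)*bs*sda + i%bs + j*bs];
+	}
+#endif
+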
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+ B[2+bs*1] = alpha*A[2+bs*1];
+ B[3+bs*1] = alpha*A[3+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+ B[2+bs*2] = alpha*A[2+bs*2];
+ B[3+bs*2] = alpha*A[3+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+ B[2+bs*3] = alpha*A[2+bs*3];
+ B[3+bs*3] = alpha*A[3+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ B[2+bs*1] = alpha*A[2+bs*1];
+ B[3+bs*1] = alpha*A[3+bs*1];
+
+ B[3+bs*2] = alpha*A[3+bs*2];
+
+ }
+
+ }
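+
+/*
+ * Editorial note: when tri==1 the kernel above copies one extra full 4-row
+ * column (hence kmax += 1) and then the strictly lower 3x3 tail of the next
+ * three columns. A plain-C restatement of that tail (hypothetical helper
+ * name), assuming A and B already point at the column where the tail starts
+ * as they do after the loops above, is kept disabled below.
+ */
+#if 0
+// reference only: strictly lower triangle of a 4x3 block, rows j+1..3 of
+// column j
+static void dgecp_tri_tail_ref(double alpha, const double *A, double *B)
+	{
+	const int bs = 4;
+	int i, j;
+	for(j=0; j<3; j++)
+		for(i=j+1; i<4; i++)
+			B[i+bs*j] = alpha * A[i+bs*j];
+	}
+#endif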
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[1+bs*0];
+ B[1+bs*0] = alpha*A0[2+bs*0];
+ B[2+bs*0] = alpha*A0[3+bs*0];
+ B[3+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[1+bs*1];
+ B[1+bs*1] = alpha*A0[2+bs*1];
+ B[2+bs*1] = alpha*A0[3+bs*1];
+ B[3+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[1+bs*2];
+ B[1+bs*2] = alpha*A0[2+bs*2];
+ B[2+bs*2] = alpha*A0[3+bs*2];
+ B[3+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[1+bs*3];
+ B[1+bs*3] = alpha*A0[2+bs*3];
+ B[2+bs*3] = alpha*A0[3+bs*3];
+ B[3+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[1+bs*0];
+ B[1+bs*0] = alpha*A0[2+bs*0];
+ B[2+bs*0] = alpha*A0[3+bs*0];
+ B[3+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+0*bs] = alpha*A0[2+0*bs];
+ B[2+0*bs] = alpha*A0[3+0*bs];
+ B[3+0*bs] = alpha*A1[0+0*bs];
+
+ B[2+1*bs] = alpha*A0[3+1*bs];
+ B[3+1*bs] = alpha*A1[0+1*bs];
+
+ B[3+2*bs] = alpha*A1[0+2*bs];
+
+ }
+
+ }
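+
+/*
+ * Editorial note: the _4_1, _4_2 and _4_3 variants copy with a source row
+ * offset, so destination row r of each column is read from source row
+ * offs+r, crossing from panel A0 into the next panel A1 = A0 + bs*sda once
+ * offs+r >= 4. A plain-C sketch of one column (hypothetical helper name,
+ * mirroring the kernels in this file) is kept disabled below.
+ */
+#if 0
+// reference only: one column of the offset copy, offs in {1,2,3}
+static void dgecp_4_offs_col_ref(int offs, double alpha, const double *A0, const double *A1, double *B, int col)
+	{
+	const int bs = 4;
+	int r, s;
+	for(r=0; r<4; r++)
+		{
+		s = offs + r;
+		B[r+bs*col] = alpha * (s<bs ? A0[s+bs*col] : A1[s-bs+bs*col]);
+		}
+	}
+#endif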
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ B[0+bs*1] = alpha*A0[2+bs*1];
+ B[1+bs*1] = alpha*A0[3+bs*1];
+ B[2+bs*1] = alpha*A1[0+bs*1];
+ B[3+bs*1] = alpha*A1[1+bs*1];
+
+ B[0+bs*2] = alpha*A0[2+bs*2];
+ B[1+bs*2] = alpha*A0[3+bs*2];
+ B[2+bs*2] = alpha*A1[0+bs*2];
+ B[3+bs*2] = alpha*A1[1+bs*2];
+
+ B[0+bs*3] = alpha*A0[2+bs*3];
+ B[1+bs*3] = alpha*A0[3+bs*3];
+ B[2+bs*3] = alpha*A1[0+bs*3];
+ B[3+bs*3] = alpha*A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+		// 3x3 triangle
+
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ B[2+bs*1] = alpha*A1[0+bs*1];
+ B[3+bs*1] = alpha*A1[1+bs*1];
+
+ B[3+bs*2] = alpha*A1[1+bs*2];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+ B[2+bs*1] = alpha*A1[1+bs*1];
+ B[3+bs*1] = alpha*A1[2+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+ B[2+bs*2] = alpha*A1[1+bs*2];
+ B[3+bs*2] = alpha*A1[2+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+ B[2+bs*3] = alpha*A1[1+bs*3];
+ B[3+bs*3] = alpha*A1[2+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ B[2+bs*1] = alpha*A1[1+bs*1];
+ B[3+bs*1] = alpha*A1[2+bs*1];
+
+ B[3+bs*2] = alpha*A1[2+bs*2];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+ B[2+bs*1] = alpha*A[2+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+ B[2+bs*2] = alpha*A[2+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+ B[2+bs*3] = alpha*A[2+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ B[2+bs*1] = alpha*A[2+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[2+bs*1];
+ B[1+bs*1] = alpha*A0[3+bs*1];
+ B[2+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[2+bs*2];
+ B[1+bs*2] = alpha*A0[3+bs*2];
+ B[2+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[2+bs*3];
+ B[1+bs*3] = alpha*A0[3+bs*3];
+ B[2+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ B[2+bs*1] = alpha*A1[0+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+ B[2+bs*1] = alpha*A1[1+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+ B[2+bs*2] = alpha*A1[1+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+ B[2+bs*3] = alpha*A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ B[2+bs*1] = alpha*A1[1+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+ B[2+bs*1] += alpha * A[2+bs*1];
+ B[3+bs*1] += alpha * A[3+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+ B[2+bs*2] += alpha * A[2+bs*2];
+ B[3+bs*2] += alpha * A[3+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+ B[2+bs*3] += alpha * A[2+bs*3];
+ B[3+bs*3] += alpha * A[3+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[1+bs*1];
+ B[1+bs*1] += alpha * A0[2+bs*1];
+ B[2+bs*1] += alpha * A0[3+bs*1];
+ B[3+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[1+bs*2];
+ B[1+bs*2] += alpha * A0[2+bs*2];
+ B[2+bs*2] += alpha * A0[3+bs*2];
+ B[3+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[1+bs*3];
+ B[1+bs*3] += alpha * A0[2+bs*3];
+ B[2+bs*3] += alpha * A0[3+bs*3];
+ B[3+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ B[0+bs*1] += alpha * A0[2+bs*1];
+ B[1+bs*1] += alpha * A0[3+bs*1];
+ B[2+bs*1] += alpha * A1[0+bs*1];
+ B[3+bs*1] += alpha * A1[1+bs*1];
+
+ B[0+bs*2] += alpha * A0[2+bs*2];
+ B[1+bs*2] += alpha * A0[3+bs*2];
+ B[2+bs*2] += alpha * A1[0+bs*2];
+ B[3+bs*2] += alpha * A1[1+bs*2];
+
+ B[0+bs*3] += alpha * A0[2+bs*3];
+ B[1+bs*3] += alpha * A0[3+bs*3];
+ B[2+bs*3] += alpha * A1[0+bs*3];
+ B[3+bs*3] += alpha * A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+ B[2+bs*1] += alpha * A1[1+bs*1];
+ B[3+bs*1] += alpha * A1[2+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+ B[2+bs*2] += alpha * A1[1+bs*2];
+ B[3+bs*2] += alpha * A1[2+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+ B[2+bs*3] += alpha * A1[1+bs*3];
+ B[3+bs*3] += alpha * A1[2+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+ B[2+bs*1] += alpha * A[2+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+ B[2+bs*2] += alpha * A[2+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+ B[2+bs*3] += alpha * A[2+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[2+bs*1];
+ B[1+bs*1] += alpha * A0[3+bs*1];
+ B[2+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[2+bs*2];
+ B[1+bs*2] += alpha * A0[3+bs*2];
+ B[2+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[2+bs*3];
+ B[1+bs*3] += alpha * A0[3+bs*3];
+ B[2+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+ B[2+bs*1] += alpha * A1[1+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+ B[2+bs*2] += alpha * A1[1+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+ B[2+bs*3] += alpha * A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
diff --git a/auxiliary/c99/kernel_dgetr_lib4.c b/auxiliary/c99/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..7d62277
--- /dev/null
+++ b/auxiliary/c99/kernel_dgetr_lib4.c
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+ C[2+bs*3] = alpha * A[3+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+ C[3+bs*3] = alpha * A[3+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
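+
+/*
+ * Editorial note: kna appears to be the number of rows of C that still fit in
+ * the current panel before a panel boundary is crossed; after kmax transposed
+ * rows have been written, (bs-(bs-kna+kmax)%bs)%bs recomputes the same
+ * quantity, which selects how the trailing 3x3 triangle is split across
+ * panels (the kna==1, kna==2 and default branches above). A worked check of
+ * that recurrence (hypothetical helper name) is kept disabled below.
+ */
+#if 0
+#include <assert.h>
+// illustrative only: 3 rows fit initially; after 6 rows, 1 row is left in
+// the current panel, so the kna==1 branch would be taken
+static void kna_recurrence_check(void)
+	{
+	const int bs = 4;
+	int kna = 3, kmax = 6;
+	assert( (bs-(bs-kna+kmax)%bs)%bs == 1 );
+	}
+#endif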
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[0+ps*1] = A[1+ps*0];
+ B[0+ps*2] = A[2+ps*0];
+ B[0+ps*3] = A[3+ps*0];
+ //
+ B[1+ps*0] = A[0+ps*1];
+ B[1+ps*1] = A[1+ps*1];
+ B[1+ps*2] = A[2+ps*1];
+ B[1+ps*3] = A[3+ps*1];
+ //
+ B[2+ps*0] = A[0+ps*2];
+ B[2+ps*1] = A[1+ps*2];
+ B[2+ps*2] = A[2+ps*2];
+ B[2+ps*3] = A[3+ps*2];
+ //
+ B[3+ps*0] = A[0+ps*3];
+ B[3+ps*1] = A[1+ps*3];
+ B[3+ps*2] = A[2+ps*3];
+ B[3+ps*3] = A[3+ps*3];
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
+
diff --git a/auxiliary/c99/kernel_sgetr_lib4.c b/auxiliary/c99/kernel_sgetr_lib4.c
new file mode 100644
index 0000000..4cf6fa2
--- /dev/null
+++ b/auxiliary/c99/kernel_sgetr_lib4.c
@@ -0,0 +1,370 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+ C[2+bs*3] = alpha * A[3+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+ C[3+bs*3] = alpha * A[3+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+
diff --git a/auxiliary/d_aux_ext_dep_lib.c b/auxiliary/d_aux_ext_dep_lib.c
new file mode 100644
index 0000000..c12da10
--- /dev/null
+++ b/auxiliary/d_aux_ext_dep_lib.c
@@ -0,0 +1,632 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void d_zeros(double **pA, int row, int col)
+ {
+ *pA = malloc((row*col)*sizeof(double));
+ double *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void d_zeros_align(double **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (double *) _aligned_malloc( (row*col)*sizeof(double), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(double));
+ if(err!=0)
+ {
+ printf("Memory allocation error");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ double *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* frees matrix */
+void d_free(double *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void d_free_align(double *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
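+
+/*
+ * Editorial note: a hedged usage sketch of the helpers above (hypothetical
+ * function name, illustrative only), pairing the cache-line-aligned allocator
+ * with its matching free routine.
+ */
+#if 0
+static void zeros_align_example(void)
+	{
+	double *A;
+	d_zeros_align(&A, 4, 4); // 4x4 zero matrix, 64-byte aligned
+	A[0] = 1.0;
+	d_free_align(A);
+	}
+#endif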
+
+
+
+/* prints a matrix in column-major format */
+void d_print_mat(int m, int n, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void d_print_tran_mat(int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints a matrix in column-major format to a file */
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void d_print_e_mat(int m, int n, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%1.15e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void d_print_e_tran_mat(int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+ {
+ const int bs = D_PS;
+ int nc = D_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ d_zeros_align(&(sA->pA), sA->pm, sA->cn);
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ d_zeros_align(&(sA->dA), tmp, 1);
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+ {
+ d_free_align(sA->pA);
+ d_free_align(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void d_allocate_strvec(int m, struct d_strvec *sa)
+ {
+ const int bs = D_PS;
+// int nc = D_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ d_zeros_align(&(sa->pa), sa->pm, 1);
+ sa->memory_size = pm*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+ {
+ d_free_align(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(1, m, pa, 1);
+ return;
+ }
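+
+
+// Usage sketch (disabled code, relying only on the routines defined above in
+// this file): dynamically allocate a 4x4 strmat and a length-4 strvec, print
+// them and free the memory again.
+#if 0
+void example_d_allocate(void)
+	{
+	struct d_strmat sA;
+	struct d_strvec sx;
+	d_allocate_strmat(4, 4, &sA); // zero-initialized, cache-aligned storage
+	d_allocate_strvec(4, &sx);
+	d_print_strmat(4, 4, &sA, 0, 0);
+	d_print_tran_strvec(4, &sx, 0);
+	d_free_strmat(&sA);
+	d_free_strvec(&sx);
+	}
+#endif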
+
+
+
+// print a matrix structure to a file
+void d_print_to_file_strmat(FILE * file, int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j]);
+ }
+ fprintf(file, "\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ fprintf(file, "\n");
+ return;
+ }
+
+
+
+// print a vector structure to a file
+void d_print_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_tran_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure (exponential notation)
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure (exponential notation)
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure (exponential notation)
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+ {
+ sA->m = m;
+ sA->n = n;
+ d_zeros(&(sA->pA), sA->m, sA->n);
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ d_zeros(&(sA->dA), tmp, 1);
+ sA->memory_size = (m*n+tmp)*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+ {
+ free(sA->pA);
+ free(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m
+void d_allocate_strvec(int m, struct d_strvec *sa)
+ {
+ sa->m = m;
+ d_zeros(&(sa->pa), sa->m, 1);
+ sa->memory_size = m*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+ {
+ free(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure to a file
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_to_file_mat(file, m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure to a file
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_to_file_tran_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure (exponential notation)
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_e_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure (exponential notation)
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure (exponential notation)
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/d_aux_lib.c b/auxiliary/d_aux_lib.c
new file mode 100644
index 0000000..6f1f5d1
--- /dev/null
+++ b/auxiliary/d_aux_lib.c
@@ -0,0 +1,982 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+ {
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ int size = (m*n+tmp)*sizeof(double);
+ return size;
+ }
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+ {
+ int size = 0;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ size = tmp*sizeof(double);
+ return size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+ {
+ sA->m = m;
+ sA->n = n;
+ double *ptr = (double *) memory;
+ sA->pA = ptr;
+ ptr += m*n;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (m*n+tmp)*sizeof(double);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+ {
+ int size = m*sizeof(double);
+ return size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+ {
+ sa->m = m;
+ double *ptr = (double *) memory;
+ sa->pa = ptr;
+// ptr += m * n;
+ sa->memory_size = m*sizeof(double);
+ return;
+ }
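+
+
+// Usage sketch (disabled code, using only the size/create routines above):
+// query the memory requirement of a strmat, hand it a user-allocated buffer
+// and wrap that buffer without any further allocation.
+#if 0
+void example_d_create(void)
+	{
+	struct d_strmat sA;
+	void *mem = malloc(d_size_strmat(6, 5));
+	d_create_strmat(6, 5, &sA, mem); // sA.pA and sA.dA now point into mem
+	// ... use sA ...
+	free(mem);
+	}
+#endif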
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+ pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+ pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+ pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+ pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+ A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+ A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+ A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+ A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
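+
+
+// Round-trip sketch (disabled code): copy a column-major 3x3 matrix into a
+// strmat created on user memory, then copy it back out again.
+#if 0
+void example_d_cvt(void)
+	{
+	double A[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; // column-major, lda=3
+	double B[9];
+	struct d_strmat sA;
+	void *mem = malloc(d_size_strmat(3, 3));
+	d_create_strmat(3, 3, &sA, mem);
+	d_cvt_mat2strmat(3, 3, A, 3, &sA, 0, 0);
+	d_cvt_strmat2mat(3, 3, &sA, 0, 0, B, 3); // B now equals A
+	free(mem);
+	}
+#endif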
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ return x[0];
+ }
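+
+
+// Element-access sketch (disabled code): write and read back single entries
+// of a strmat and a strvec through the *in1/*ex1 routines above.
+#if 0
+void example_d_elem(struct d_strmat *sA, struct d_strvec *sx)
+	{
+	dgein1_libstr(3.14, sA, 1, 2);      // sA(1,2) = 3.14
+	double a = dgeex1_libstr(sA, 1, 2); // a == 3.14
+	dvecin1_libstr(a, sx, 0);           // sx(0) = a
+	double b = dvecex1_libstr(sx, 0);
+	(void)b;
+	}
+#endif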
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pA[ii+lda*jj] = alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha;
+ return;
+ }
+
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*lda];
+ return;
+ }
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] += alpha*x[ii];
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ double tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii*lda];
+ pA[ii*lda] = pC[ii*ldc];
+ pC[ii*ldc] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = pA[ii];
+ return;
+ }
+
+
+
+// insert a vector into a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii] = x[ii];
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ double tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii];
+ pA[ii] = pC[ii];
+ pC[ii] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
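+
+
+// Permutation sketch (disabled code): ipiv[ii] holds the 0-based row/column
+// index to be swapped with position ii (as produced e.g. by an LU
+// factorization with partial pivoting); entries with ipiv[ii]==ii are skipped.
+#if 0
+void example_d_permute(struct d_strmat *sA)
+	{
+	int ipiv[3] = {2, 1, 2}; // swap rows/cols 0 and 2, leave 1 and 2 in place
+	drowpe_libstr(3, ipiv, sA);
+	dcolpe_libstr(3, ipiv, sA);
+	}
+#endif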
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ pA[ii+1+jj*lda] *= alpha;
+ pA[ii+2+jj*lda] *= alpha;
+ pA[ii+3+jj*lda] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+ pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+ pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = 0;
+ for(; ii<=jj; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// insert a strvec into the diagonal of a strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*(lda+1)];
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii*(ldd+1)];
+ }
+ return;
+ }
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha*x[ii];
+ return;
+ }
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*ldd] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
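+
+
+// Sparse-update sketch (disabled code): idx selects which entries of the
+// destination (dvecad_sp/dvecin_sp) or of the source (dvecex_sp) take part in
+// the operation; all other entries are left untouched.
+#if 0
+void example_d_vec_sp(struct d_strvec *sx, struct d_strvec *sz)
+	{
+	int idx[2] = {0, 3};
+	dvecad_sp_libstr(2, 2.0, sx, 0, idx, sz, 0); // z[0] += 2*x[0], z[3] += 2*x[1]
+	dvecex_sp_libstr(2, 1.0, idx, sx, 0, sz, 0); // z[0] = x[0], z[1] = x[3]
+	}
+#endif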
+
+
+// clip without mask return
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+ {
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+ return;
+ }
+
+
+
+// clip with mask return
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+ {
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ double *mask = sm->pa + mi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+ return;
+ }
+
+
+// zero out components using mask
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+ {
+ double *mask = sm->pa + mi;
+ double *v = sv->pa + vi;
+ double *e = se->pa + ei;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+ return;
+ }
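+
+
+// Clipping sketch (disabled code): dveccl_mask_libstr clips x into [xm, xp]
+// and records in the mask which bound is active (1.0 upper, -1.0 lower, 0.0
+// inactive); dvecze_libstr then zeroes exactly the entries flagged as active.
+#if 0
+void example_d_clip(int m, struct d_strvec *sxm, struct d_strvec *sx,
+	struct d_strvec *sxp, struct d_strvec *sz, struct d_strvec *sm, struct d_strvec *se)
+	{
+	dveccl_mask_libstr(m, sxm, 0, sx, 0, sxp, 0, sz, 0, sm, 0);
+	dvecze_libstr(m, sm, 0, sz, 0, se, 0);
+	}
+#endif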
+
+
+
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+ {
+ int ii;
+ double *x = sx->pa + xi;
+ double norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/d_aux_lib4.c b/auxiliary/d_aux_lib4.c
new file mode 100644
index 0000000..152aed1
--- /dev/null
+++ b/auxiliary/d_aux_lib4.c
@@ -0,0 +1,3609 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+// copies a packed matrix into a packed matrix
+// TODO remove alpha
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_0_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_0_lib4(0, n, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(0, n, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_1_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_1_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_2_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_2_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_3_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_3_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B);
+ }
+ }
+
+ }
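+
+
+// Usage sketch (disabled code, assuming the panel-major layout of the
+// LA_HIGH_PERFORMANCE strmat, whose panel stride is the cn member): copy an
+// m x n block between two strmat starting at row offset 0 in both.
+#if 0
+void example_dgecp(struct d_strmat *sA, struct d_strmat *sB, int m, int n)
+	{
+	dgecp_lib(m, n, 1.0, 0, sA->pA, sA->cn, 0, sB->pA, sB->cn);
+	}
+#endif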
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0)
+ return;
+
+ int n = m;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_0_lib4(1, ii, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_1_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_2_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_3_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B);
+ }
+ }
+
+ }
+
+
+
+// scales and adds a packed matrix into a packed matrix: B = B + alpha*A
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_0_lib4(n, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_0_lib4(n, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_0_lib4(n, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
+ }
+ }
+
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// transpose a general matrix; m and n refer to the original matrix
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(mna==2)
+ kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else //if(mna==3)
+ kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+// for( ; ii<m; ii+=4)
+ {
+ kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
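+
+
+// Usage sketch (disabled code): transpose the leading m x n block of a
+// panel-major matrix pA (panel stride sda) into pC (panel stride sdc), both
+// starting at row offset 0 within their panels and without scaling.
+#if 0
+void example_dgetr(int m, int n, double *pA, int sda, double *pC, int sdc)
+	{
+	dgetr_lib(m, n, 1.0, 0, pA, sda, 0, pC, sdc);
+	}
+#endif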
+
+
+
+// transpose lower triangular matrix
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+ x x x x x x x
+ x x x x x x
+ x x x x x
+ x x x x
+
+ x x x
+ x x
+ x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ pC[0] = alpha * pA[0];
+ }
+ else if(mna==2)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if 0 //defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+A =
+ x x x x x x x x
+ x x x x x x x
+
+ x x x x x x
+ x x x x x
+ x x x x
+ x x x
+ x x
+ x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+ int tna = nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ if(nna!=1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1;
+ tna = (bs-(offsetC+1)%bs)%bs;
+ }
+ else //if(nna==1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+1)%bs)%bs;
+ }
+// kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+ }
+ else if(mna==2)
+ {
+ if(nna==0 || nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = (bs-(offsetC+2)%bs)%bs;
+ kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+ kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3; //(bs-(offsetC+2)%bs)%bs;
+// kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+2)%bs)%bs;
+ kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ tna = 1;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = 2;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+ kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3;
+// kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else //if(nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ tna = 0;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if 0 //defined(TARGET_X64_AVX2)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ if(tna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[3+bs*0] = alpha * pA[0+bs*3];
+ pC[3+bs*1] = alpha * pA[1+bs*3];
+ pC[3+bs*2] = alpha * pA[2+bs*3];
+ pC[3+bs*3] = alpha * pA[3+bs*3];
+ pA += 4*bs;
+ pC += sdc*bs;
+ kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[2+bs*3] = alpha * pA[3+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+ }
+ else if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[1+bs*3] = alpha * pA[3+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+ }
+ else //if(tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+// pC[0+bs*3] = alpha * pA[3+bs*0];
+ pA += bs;
+ pC += 1;
+// kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+ }
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ }
+ else if(m-ii==2)
+ {
+ if(tna!=1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ else //if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ }
+ }
+ else if(m-ii==3)
+ {
+ if(tna==0 || tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ }
+ else //if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// regularize diagonal
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += reg;
+ pD[jj*sdd+(jj+1)*bs+1] += reg;
+ pD[jj*sdd+(jj+2)*bs+2] += reg;
+ pD[jj*sdd+(jj+3)*bs+3] += reg;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+ pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+ pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+ pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+ x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+ x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+ x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
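+/* Note on the indexing used by the sparse routines below: in the panel-major
+   layout (panel height bs=4, panel stride sdd in units of doubles) element (i,j)
+   is stored at pD[i/bs*bs*sdd + i%bs + j*bs]; for a diagonal entry i==j this
+   reduces to the expression ii/bs*bs*sdd + ii%bs + ii*bs used below. For example,
+   with sdd=8 the diagonal entry ii=5 lives at offset 4*8 + 1 + 5*4 = 53. */
+
+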
+// insert vector to diagonal, sparse formulation
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row
+void drowin_lib(int kmax, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] = alpha*x[jj+0];
+ pD[(jj+1)*bs] = alpha*x[jj+1];
+ pD[(jj+2)*bs] = alpha*x[jj+2];
+ pD[(jj+3)*bs] = alpha*x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// extract row to vector
+void drowex_lib(int kmax, double alpha, double *pD, double *x)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pD[(jj+0)*bs];
+ x[jj+1] = alpha*pD[(jj+1)*bs];
+ x[jj+2] = alpha*pD[(jj+2)*bs];
+ x[jj+3] = alpha*pD[(jj+3)*bs];
+ }
+ for(; jj<kmax; jj++)
+ {
+ x[jj] = alpha*pD[(jj)*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to row
+void drowad_lib(int kmax, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] += alpha * x[jj+0];
+ pD[(jj+1)*bs] += alpha * x[jj+1];
+ pD[(jj+2)*bs] += alpha * x[jj+2];
+ pD[(jj+3)*bs] += alpha * x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row, sparse formulation
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two rows
+void drowsw_lib(int kmax, double *pA, double *pC)
+ {
+
+ const int bs = 4;
+
+ int ii;
+ double tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+
+ }
+
+
+
+// extract vector from column
+void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = pD[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = pD[jj*sdd+0];
+ x[jj+1] = pD[jj*sdd+1];
+ x[jj+2] = pD[jj*sdd+2];
+ x[jj+3] = pD[jj*sdd+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = pD[jj*sdd+ll];
+ }
+
+ }
+
+
+
+// insert vector to column
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] = x[jj+0];
+ pD[jj*sdd+1] = x[jj+1];
+ pD[jj*sdd+2] = x[jj+2];
+ pD[jj*sdd+3] = x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] = x[jj+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] += alpha * x[jj+0];
+ pD[jj*sdd+1] += alpha * x[jj+1];
+ pD[jj*sdd+2] += alpha * x[jj+2];
+ pD[jj*sdd+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to column, sparse formulation
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to column, sparse formulation
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two cols
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+ const int bs = 4;
+
+ int ii;
+
+ double tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void dvecin_libsp(int kmax, int *idx, double *x, double *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// adds vector to vector, sparse formulation
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(double);
+ return memory_size;
+ }
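+
+/* For example, with bs=4 and assuming D_NC==4 (D_NC is target dependent, so the
+   figure is only illustrative), m=5 and n=7 give pm=8, cn=8, al=16 and tmp=16,
+   hence (8*8+16)*sizeof(double) = 640 bytes: the panel-major array plus the
+   scratch space for the diagonal backup dA. */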
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(double);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ double *ptr = (double *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(double);
+ return;
+ }
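+
+/* Minimal usage sketch (illustration only, not part of the library; it assumes the
+   BLASFEO headers in include/ are on the include path). malloc() is used for
+   brevity, while a 64-byte aligned allocation is generally preferable for the
+   vectorized kernels:
+
+       #include <stdlib.h>
+       #include "blasfeo_common.h"
+       #include "blasfeo_d_aux.h"
+
+       void use_strmat(int m, int n, double *A, int lda)
+           {
+           struct d_strmat sA;
+           void *mem = malloc(d_size_strmat(m, n));
+           d_create_strmat(m, n, &sA, mem);           // map the struct onto the buffer
+           d_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0); // pack column-major A into panels
+           // ... call BLASFEO routines on &sA ...
+           free(mem);
+           }
+*/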
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+ {
+ const int bs = 4;
+// int nc = D_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(double);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+ {
+ const int bs = 4;
+// int nc = D_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ double *ptr = (double *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(double);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ double *B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ tmp;
+#endif
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; ii<m-3; ii+=4)
+ {
+ tmp = _mm256_loadu_pd( &B[0+lda*0] );
+ _mm256_store_pd( &pB[0+bs*0], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*1] );
+ _mm256_store_pd( &pB[0+bs*1], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*2] );
+ _mm256_store_pd( &pB[0+bs*2], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*3] );
+ _mm256_store_pd( &pB[0+bs*3], tmp );
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+#endif
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ double *B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+#endif
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-3; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; j<m-3; j+=4)
+ {
+ v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
+ v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
+ v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ B += 4;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &pB[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &pB[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &pB[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &pB[0+bs*3], v3 );
+
+ pB += 4*bs;
+ }
+#else
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+#endif
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ double *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ // unroll 0
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+ // unroll 0
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+ // unroll 0
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*jj] = ptr_pA[0];
+ A[1+ii+lda*jj] = ptr_pA[1];
+ A[2+ii+lda*jj] = ptr_pA[2];
+ A[3+ii+lda*jj] = ptr_pA[3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ double *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] = alpha*x[ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
+ pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
+ pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
+ pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] += alpha;
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] += alpha;
+ pA[jj*sda+(jj+1)*bs+1] += alpha;
+ pA[jj*sda+(jj+2)*bs+2] += alpha;
+ pA[jj*sda+(jj+3)*bs+3] += alpha;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] += alpha;
+ }
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ drowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ dcolex_lib(kmax, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+
+// insert a vector as a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ dcolin_lib(kmax, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgecp_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ dgecp_lib(m, n, alpha, ai%bs, pA, sda, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrcp_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha*pA[ll+bs*ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
+ x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
+ x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
+ x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] += alpha*x[ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
+ pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
+ pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
+ pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ drowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
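+// add scaled strvec to strvec, sparse formulation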
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
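+// insert scaled strvec to strvec, sparse formulation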
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
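+// extract scaled strvec entries (gather), sparse formulation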
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
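+// clip each entry of a strvec to the box [xm, xp]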
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+ {
+
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ xm0, x0, xp0, z0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+ mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+ mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ _mm256_storeu_pd( &z[ii], z0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask1, mask2 );
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+#endif
+
+ return;
+
+ }
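+
+/* Note on the tail handling in the vectorized branch above: mask1 holds
+   {0.5, 1.5, 2.5, 3.5} (lowest lane first) and the number of leftover elements
+   m-ii (1 to 3) is broadcast and subtracted from it, so exactly the first m-ii
+   lanes become negative; _mm256_maskstore_pd() writes a lane only when the sign
+   bit of the corresponding mask element is set, so only the valid entries of z
+   are stored. E.g. for m-ii==2 the mask is {-1.5, -0.5, 0.5, 1.5}, which enables
+   lanes 0 and 1 only. The same trick is used by the masked routines below. */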
+
+
+
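+// clip each entry of a strvec to the box [xm, xp] and record the active bound in a mask (1.0 upper, -1.0 lower, 0.0 inactive)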
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+ {
+
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ double *mask = sm->pa + mi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+ mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+ mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ mask0 = _mm256_setzero_pd();
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+ mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+ _mm256_storeu_pd( &z[ii], z0 );
+ _mm256_storeu_pd( &mask[ii], mask0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask1, mask2 );
+ mask0 = _mm256_setzero_pd();
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+ mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+ _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+ _mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+#endif
+
+ return;
+
+ }
+
+
+
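+// copy the entries of a strvec where the mask is zero, set the other entries to zero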
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+ {
+ double *mask = sm->pa + mi;
+ double *v = sv->pa + vi;
+ double *e = se->pa + ei;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ mask0, mask1, mask2, mask3, fives, zeros, e0, v0;
+
+ fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
+ zeros = _mm256_setzero_pd();
+ mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ v0 = _mm256_loadu_pd( &v[ii] );
+ mask0 = _mm256_loadu_pd( &mask[ii] );
+ mask1 = mask0;
+ mask0 = _mm256_sub_pd( mask0, fives);
+ mask1 = _mm256_add_pd( mask1, fives);
+ mask0 = _mm256_xor_pd( mask0, mask1);
+ e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+ _mm256_storeu_pd( &e[ii], e0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask3, mask2 );
+ v0 = _mm256_loadu_pd( &v[ii] );
+ mask0 = _mm256_loadu_pd( &mask[ii] );
+ mask1 = mask0;
+ mask0 = _mm256_sub_pd( mask0, fives);
+ mask1 = _mm256_add_pd( mask1, fives);
+ mask0 = _mm256_xor_pd( mask0, mask1);
+ e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+ _mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+#endif
+
+ }
+
+
+
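+// compute the infinity norm of a strvec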
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+ {
+ int ii;
+ double *x = sx->pa + xi;
+ double norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/i_aux_ext_dep_lib.c b/auxiliary/i_aux_ext_dep_lib.c
new file mode 100644
index 0000000..1ca2292
--- /dev/null
+++ b/auxiliary/i_aux_ext_dep_lib.c
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void int_zeros(int **pA, int row, int col)
+ {
+ void *temp = malloc((row*col)*sizeof(int));
+ *pA = temp;
+ int *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void int_zeros_align(int **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (int *) _aligned_malloc( (row*col)*sizeof(int), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(int));
+ if(err!=0)
+ {
+ printf("Memory allocation error");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ int *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void int_free(int *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void int_free_align(int *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* prints a matrix in column-major format */
+void int_print_mat(int row, int col, int *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ printf("%d ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
diff --git a/auxiliary/m_aux_lib.c b/auxiliary/m_aux_lib.c
new file mode 100644
index 0000000..30cb333
--- /dev/null
+++ b/auxiliary/m_aux_lib.c
@@ -0,0 +1,112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+ int lda = Md->m;
+ int ldb = Ms->m;
+ double *pA = Md->pA+mid+nid*lda;
+ float *pB = Ms->pA+mis+nis*ldb;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pB[ii+jj*ldb] = (float) pA[ii+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+ int lda = Ms->m;
+ int ldb = Md->m;
+ float *pA = Ms->pA+mis+nis*lda;
+ double *pB = Md->pA+mid+nid*ldb;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pB[ii+jj*ldb] = (double) pA[ii+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/m_aux_lib44.c b/auxiliary/m_aux_lib44.c
new file mode 100644
index 0000000..a17d545
--- /dev/null
+++ b/auxiliary/m_aux_lib44.c
@@ -0,0 +1,93 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+ printf("\nm_cvt_d2s_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+ printf("\nm_cvt_s2d_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/m_aux_lib48.c b/auxiliary/m_aux_lib48.c
new file mode 100644
index 0000000..e9fdcd2
--- /dev/null
+++ b/auxiliary/m_aux_lib48.c
@@ -0,0 +1,153 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+// printf("\nm_cvt_d2s_strmat: feature not implmeneted yet\n\n");
+// exit(1);
+ if(mid!=0 || mis!=0)
+ {
+ printf("\nm_cvt_d2s_strmat: feature not implemented yet: mid=%d, mis=%d\n\n", mid, mis);
+ exit(1);
+ }
+ const int psd = 4;
+ const int pss = 8;
+ const int sdd = Md->cn;
+ double *D0 = Md->pA + nid*psd;
+ double *D1;
+ const int sds = Ms->cn;
+ float *S = Ms->pA + nis*pss;
+ int ii, jj, ll;
+ for(ii=0; ii<m-7; ii+=8)
+ {
+ D1 = D0 + psd*sdd;
+ for(jj=0; jj<n; jj++)
+ {
+ S[0+jj*pss] = (float) D0[0+jj*psd];
+ S[1+jj*pss] = (float) D0[1+jj*psd];
+ S[2+jj*pss] = (float) D0[2+jj*psd];
+ S[3+jj*pss] = (float) D0[3+jj*psd];
+ S[4+jj*pss] = (float) D1[0+jj*psd];
+ S[5+jj*pss] = (float) D1[1+jj*psd];
+ S[6+jj*pss] = (float) D1[2+jj*psd];
+ S[7+jj*pss] = (float) D1[3+jj*psd];
+ }
+ D0 += 8*sdd;
+ S += 8*sds;
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ll=0; ll<m-ii; ll++)
+ {
+ S[ll+jj*pss] = (float) D0[ll+jj*psd];
+ }
+ }
+ return;
+ }
+ else
+ {
+ D1 = D0 + psd*sdd;
+ for(jj=0; jj<n; jj++)
+ {
+ S[0+jj*pss] = (float) D0[0+jj*psd];
+ S[1+jj*pss] = (float) D0[1+jj*psd];
+ S[2+jj*pss] = (float) D0[2+jj*psd];
+ S[3+jj*pss] = (float) D0[3+jj*psd];
+ for(ll=0; ll<m-ii-4; ll++)
+ {
+ S[4+ll+jj*pss] = (float) D1[ll+jj*psd];
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+ printf("\nm_cvt_s2d_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/s_aux_ext_dep_lib.c b/auxiliary/s_aux_ext_dep_lib.c
new file mode 100644
index 0000000..85f7ebc
--- /dev/null
+++ b/auxiliary/s_aux_ext_dep_lib.c
@@ -0,0 +1,633 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void s_zeros(float **pA, int row, int col)
+ {
+ *pA = malloc((row*col)*sizeof(float));
+ float *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void s_zeros_align(float **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (float *) _aligned_malloc( (row*col)*sizeof(float), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(float));
+ if(err!=0)
+ {
+ printf("Memory allocation error");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ float *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* frees matrix */
+void s_free(float *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void s_free_align(float *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* prints a matrix in column-major format */
+void s_print_mat(int m, int n, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void s_print_tran_mat(int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints a matrix in column-major format to a file */
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void s_print_e_mat(int m, int n, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void s_print_e_tran_mat(int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+ {
+ const int bs = S_PS;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ s_zeros_align(&(sA->pA), sA->pm, sA->cn);
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ s_zeros_align(&(sA->dA), tmp, 1);
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+ {
+ s_free_align(sA->pA);
+ s_free_align(sA->dA);
+ return;
+ }
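+
+// usage sketch (illustrative only, not part of the library): with the dynamic-memory
+// interface above, a single-precision matrix structure is typically allocated,
+// printed and freed as follows (assuming this LA_HIGH_PERFORMANCE build):
+//
+//	struct s_strmat sA;
+//	s_allocate_strmat(4, 4, &sA);	// zero-initialized, padded to the panel size
+//	s_print_strmat(4, 4, &sA, 0, 0);
+//	s_free_strmat(&sA);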
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void s_allocate_strvec(int m, struct s_strvec *sa)
+ {
+ const int bs = S_PS;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ s_zeros_align(&(sa->pa), sa->pm, 1);
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+ {
+ s_free_align(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
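+	// panel-major storage: element (i,j) of sA is at sA->pA[(i/bs)*bs*sda + i%bs + j*bs];
+	// the line below points pA at element (ai,aj)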
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE * file, int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j]);
+ }
+ fprintf(file, "\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ fprintf(file, "\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+ {
+ sA->m = m;
+ sA->n = n;
+ s_zeros(&(sA->pA), sA->m, sA->n);
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ s_zeros(&(sA->dA), tmp, 1);
+ sA->memory_size = (m*n+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+ {
+ free(sA->pA);
+ free(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m
+void s_allocate_strvec(int m, struct s_strvec *sa)
+ {
+ sa->m = m;
+ s_zeros(&(sa->pa), sa->m, 1);
+ sa->memory_size = m*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+ {
+ free(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_to_file_mat(file, m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_to_file_tran_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_e_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib.c b/auxiliary/s_aux_lib.c
new file mode 100644
index 0000000..978eb9a
--- /dev/null
+++ b/auxiliary/s_aux_lib.c
@@ -0,0 +1,956 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ int size = (m*n+tmp)*sizeof(float);
+ return size;
+ }
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ int size = 0;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ size = tmp*sizeof(float);
+ return size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ sA->m = m;
+ sA->n = n;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += m*n;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (m*n+tmp)*sizeof(float);
+ return;
+ }
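+
+// usage sketch (illustrative only, not part of the library): the size/create pair lets
+// the caller own the memory, e.g.
+//
+//	void *mem = malloc(s_size_strmat(m, n));
+//	struct s_strmat sA;
+//	s_create_strmat(m, n, &sA, mem);
+//	...
+//	free(mem);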
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ int size = m*sizeof(float);
+ return size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ sa->m = m;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += m * n;
+ sa->memory_size = m*sizeof(float);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+ pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+ pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+ pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+ pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+ A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+ A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
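+
+// usage sketch (illustrative only, not part of the library): round trip between a plain
+// column-major array A (leading dimension m) and an already-created m-by-n strmat sA:
+//
+//	s_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);	// copy A into sA
+//	s_cvt_strmat2mat(m, n, &sA, 0, 0, A, m);	// copy sA back into A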
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+ A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+ A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pA[ii+lda*jj] = alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*(lda+1)];
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] = alpha*x[ii];
+ return;
+ }
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*lda];
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] += alpha*x[ii];
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ float tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii*lda];
+ pA[ii*lda] = pC[ii*ldc];
+ pC[ii*ldc] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+
+// insert a vector into a col
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii] = x[ii];
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ float tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii];
+ pA[ii] = pC[ii];
+ pC[ii] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ pA[ii+1+jj*lda] *= alpha;
+ pA[ii+2+jj*lda] *= alpha;
+ pA[ii+3+jj*lda] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a strvec into another strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+ pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+ pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = 0;
+ for(; ii<=jj; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii*(ldd+1)];
+ }
+ return;
+ }
+
+
+
+// add a scaled vector to the diagonal
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha*x[ii];
+ return;
+ }
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*ldd] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+// clip without mask return
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi)
+ {
+ float *xm = sxm->pa + xim;
+ float *x = sx->pa + xi;
+ float *xp = sxp->pa + xip;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+ return;
+ }
+
+
+
+// clip with mask return
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi)
+ {
+ float *xm = sxm->pa + xim;
+ float *x = sx->pa + xi;
+ float *xp = sxp->pa + xip;
+ float *z = sz->pa + zi;
+ float *mask = sm->pa + mi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+ return;
+ }
+
+
+// zero out components using mask
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei)
+ {
+ float *mask = sm->pa + mi;
+ float *v = sv->pa + vi;
+ float *e = se->pa + ei;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+ return;
+ }
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+ {
+ int ii;
+ float *x = sx->pa + xi;
+ float norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/s_aux_lib4.c b/auxiliary/s_aux_lib4.c
new file mode 100644
index 0000000..12acc47
--- /dev/null
+++ b/auxiliary/s_aux_lib4.c
@@ -0,0 +1,3107 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// scale and add a strvec into another strvec
+void svecad_libstr(int m, float *alphap, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float alpha = alphap[0];
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// transpose a general matrix; m and n refer to the original matrix
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(mna==2)
+ kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else //if(mna==3)
+ kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+// for( ; ii<m; ii+=4)
+ {
+ kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+ x x x x x x x
+ x x x x x x
+ x x x x x
+ x x x x
+
+ x x x
+ x x
+ x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ pC[0] = alpha * pA[0];
+ }
+ else if(mna==2)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+A =
+ x x x x x x x x
+ x x x x x x x
+
+ x x x x x x
+ x x x x x
+ x x x x
+ x x x
+ x x
+ x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+ int tna = nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ if(nna!=1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1;
+ tna = (bs-(offsetC+1)%bs)%bs;
+ }
+ else //if(nna==1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+1)%bs)%bs;
+ }
+// kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+ }
+ else if(mna==2)
+ {
+ if(nna==0 || nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = (bs-(offsetC+2)%bs)%bs;
+ kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+ kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3; //(bs-(offsetC+2)%bs)%bs;
+// kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+2)%bs)%bs;
+ kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ tna = 1;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = 2;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+ kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3;
+// kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else //if(nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ tna = 0;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ if(tna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[3+bs*0] = alpha * pA[0+bs*3];
+ pC[3+bs*1] = alpha * pA[1+bs*3];
+ pC[3+bs*2] = alpha * pA[2+bs*3];
+ pC[3+bs*3] = alpha * pA[3+bs*3];
+ pA += 4*bs;
+ pC += sdc*bs;
+ kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[2+bs*3] = alpha * pA[3+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+ }
+ else if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[1+bs*3] = alpha * pA[3+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+ }
+ else //if(tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+// pC[0+bs*3] = alpha * pA[3+bs*0];
+ pA += bs;
+ pC += 1;
+// kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+ }
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ }
+ else if(m-ii==2)
+ {
+ if(tna!=1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ else //if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ }
+ }
+ else if(m-ii==3)
+ {
+ if(tna==0 || tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ }
+ else //if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// regularize diagonal
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
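+	// kna counts the diagonal entries that fall in the (possibly partial) first panel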
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += reg;
+ pD[jj*sdd+(jj+1)*bs+1] += reg;
+ pD[jj*sdd+(jj+2)*bs+2] += reg;
+ pD[jj*sdd+(jj+3)*bs+3] += reg;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
+
+
+
+// insert vector to diagonal
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = alpha*x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+ pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+ pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+ pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+ x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+ x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+ x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] = alpha*x[jj+0];
+ pD[(jj+1)*bs] = alpha*x[jj+1];
+ pD[(jj+2)*bs] = alpha*x[jj+2];
+ pD[(jj+3)*bs] = alpha*x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pD[(jj+0)*bs];
+ x[jj+1] = alpha*pD[(jj+1)*bs];
+ x[jj+2] = alpha*pD[(jj+2)*bs];
+ x[jj+3] = alpha*pD[(jj+3)*bs];
+ }
+ for(; jj<kmax; jj++)
+ {
+ x[jj] = alpha*pD[(jj)*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to row
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] += alpha * x[jj+0];
+ pD[(jj+1)*bs] += alpha * x[jj+1];
+ pD[(jj+2)*bs] += alpha * x[jj+2];
+ pD[(jj+3)*bs] += alpha * x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row, sparse formulation
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+ {
+
+ const int bs = 4;
+
+ int ii;
+ float tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+
+ }
+
+
+
+// insert vector to column
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] = x[jj+0];
+ pD[jj*sdd+1] = x[jj+1];
+ pD[jj*sdd+2] = x[jj+2];
+ pD[jj*sdd+3] = x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] = x[jj+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] += alpha * x[jj+0];
+ pD[jj*sdd+1] += alpha * x[jj+1];
+ pD[jj*sdd+2] += alpha * x[jj+2];
+ pD[jj*sdd+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to column, sparse formulation
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to column, sparse formulation
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+ const int bs = 4;
+
+ int ii;
+
+ float tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// add vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ const int bs = 4;
+// int nc = S_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ const int bs = 4;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
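+
+
+
+// Note on the packed (panel-major) layout produced above: with panel height bs and
+// panel stride sda = sA->cn, element (i,j) of the packed matrix is stored at
+//	pA[ i/bs*bs*sda + i%bs + j*bs ]
+// which is the addressing used throughout this file. Worked example for bs=4, sda=8:
+// element (5,2) sits at 1*4*8 + 1 + 2*4 = 41 floats from pA.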
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-3; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			// unroll 1
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			// unroll 2
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			// unroll 3
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*jj] = ptr_pA[0];
+ A[1+ii+lda*jj] = ptr_pA[1];
+ A[2+ii+lda*jj] = ptr_pA[2];
+ A[3+ii+lda*jj] = ptr_pA[3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ return x[0];
+ }
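+
+
+
+// Illustrative sketch (comment only): single-element access with the four routines
+// above, assuming sA and sx have already been created and are large enough.
+//
+//	sgein1_libstr(1.5f, &sA, 2, 4);      // A(2,4) = 1.5
+//	float a = sgeex1_libstr(&sA, 2, 4);  // read the same entry back
+//	svecin1_libstr(a, &sx, 3);           // x(3) = a
+//	float b = svecex1_libstr(&sx, 3);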
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
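+
+
+
+// Illustrative sketch (comment only): zero a 4x4 block of sA starting at (0,0) and the
+// first 8 entries of sx with the two setters above.
+//
+//	sgese_libstr(4, 4, 0.0f, &sA, 0, 0);
+//	svecse_libstr(8, 0.0f, &sx, 0);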
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ srowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
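+
+
+
+// Illustrative sketch (comment only): apply a pivot sequence to the rows of sA. The
+// convention is the one implemented above: row ii is swapped with row ipiv[ii] whenever
+// they differ, in increasing order of ii (e.g. a pivot vector produced by an LU
+// factorization that uses this convention).
+//
+//	int ipiv[4] = {2, 1, 3, 3};   // hypothetical pivots
+//	srowpe_libstr(4, ipiv, &sA);  // swaps rows 0<->2, then rows 2<->3, in place
+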
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+
+	if(m<=0 || n<=0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ int offA = ai%bs;
+
+ // same alignment
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offA)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgesc_3_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgesc_4_lib4(n, &alpha, pA);
+ pA += 4*sda;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgesc_1_lib4(n, &alpha, pA);
+ else if(m-ii==2)
+ kernel_sgesc_2_lib4(n, &alpha, pA);
+ else // if(m-ii==3)
+ kernel_sgesc_3_lib4(n, &alpha, pA);
+ }
+
+ return;
+
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+	if(m<=0 || n<=0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_0_lib4(n, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_0_lib4(n, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_1_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+1, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_0_lib4(n, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_2_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+2, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_2_lib4(n, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_3_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+3, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_3_lib4(n, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ int ii, mna;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_0_lib4(ii, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_0_lib4(ii, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ int ii, mna;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
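+
+
+
+// Illustrative sketch (comment only): the generic strmat routines above compose into
+// B := beta*B + alpha*A on an m x n block,
+//
+//	sgesc_libstr(m, n, beta, &sB, bi, bj);                // B *= beta
+//	sgead_libstr(m, n, alpha, &sA, ai, aj, &sB, bi, bj);  // B += alpha*A
+//
+// while a plain copy of the same block is
+//
+//	sgecp_libstr(m, n, &sA, ai, aj, &sB, bi, bj);         // B = A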
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
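+
+
+
+// Illustrative sketch (comment only): sgetr_libstr writes the transpose of the m x n
+// block of A starting at (ai,aj) into the n x m block of C starting at (ci,cj).
+//
+//	sgetr_libstr(m, n, &sA, ai, aj, &sC, ci, cj);  // C(ci+j,cj+i) = A(ai+i,aj+j)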
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
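+
+
+
+// Illustrative sketch (comment only): sparse diagonal update with the routine above.
+// With idx = {0, 2, 5} the call adds alpha*x[k] to D(di+idx[k], dj+idx[k]) for
+// k = 0,1,2 and leaves every other entry of D untouched.
+//
+//	int idx[3] = {0, 2, 5};
+//	sdiaad_sp_libstr(3, alpha, &sx, 0, idx, &sD, 0, 0);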
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ srowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
+// add scaled strvec to strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ svecad_libsp(kmax, idx, alpha, x, y);
+ return;
+ }
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+ {
+ int ii;
+ float *x = sx->pa + xi;
+ float norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
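+
+
+
+// Illustrative sketch (comment only): infinity norm of the first m entries of a strvec,
+// e.g. as a residual check.
+//
+//	float nrm;
+//	svecnrm_inf_libstr(m, &sx, 0, &nrm);  // nrm = max_i |x(i)|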
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib8.c b/auxiliary/s_aux_lib8.c
new file mode 100644
index 0000000..94ba22d
--- /dev/null
+++ b/auxiliary/s_aux_lib8.c
@@ -0,0 +1,2647 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void strcp_l_lib(int m, int offsetA, float *A, int sda, int offsetB, float *B, int sdb)
+ {
+	printf("\nstrcp_l_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
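+
+
+
+// Illustrative sketch (comment only): svecad_libstr is an axpy on strvec contents,
+// c := c + alpha*a.
+//
+//	svecad_libstr(m, 0.5f, &sa, 0, &sc, 0);  // c += 0.5*a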
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+ printf("\nstrtr_l_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+ printf("\nstrtr_u_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// regularize diagonal
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+0*bs] += reg;
+ pD2[1+1*bs] += reg;
+ pD2[2+2*bs] += reg;
+ pD2[3+3*bs] += reg;
+ pD2[4+4*bs] += reg;
+ pD2[5+5*bs] += reg;
+ pD2[6+6*bs] += reg;
+ pD2[7+7*bs] += reg;
+ pD2 += bs*sdd+bs*bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
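+
+
+
+// Illustrative sketch (comment only): add a small regularization reg to the first k
+// diagonal entries of a packed (bs=8) matrix starting at panel offset 0; pD and sdd
+// are taken from the enclosing strmat.
+//
+//	sdiareg_lib(k, 1e-3f, 0, pD, sdd);  // D(i,i) += 1e-3 for i = 0..k-1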
+
+
+
+// insert vector to diagonal
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = alpha*x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] = alpha*x2[0];
+ pD2[1+bs*1] = alpha*x2[1];
+ pD2[2+bs*2] = alpha*x2[2];
+ pD2[3+bs*3] = alpha*x2[3];
+ pD2[4+bs*4] = alpha*x2[4];
+ pD2[5+bs*5] = alpha*x2[5];
+ pD2[6+bs*6] = alpha*x2[6];
+ pD2[7+bs*7] = alpha*x2[7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] = sqrt(x2[0]);
+ pD2[1+bs*1] = sqrt(x2[1]);
+ pD2[2+bs*2] = sqrt(x2[2]);
+ pD2[3+bs*3] = sqrt(x2[3]);
+ pD2[4+bs*4] = sqrt(x2[4]);
+ pD2[5+bs*5] = sqrt(x2[5]);
+		pD2[6+bs*6] = sqrt(x2[6]);
+ pD2[7+bs*7] = sqrt(x2[7]);
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ x2[0] = alpha * pD2[0+bs*0];
+ x2[1] = alpha * pD2[1+bs*1];
+ x2[2] = alpha * pD2[2+bs*2];
+ x2[3] = alpha * pD2[3+bs*3];
+ x2[4] = alpha * pD2[4+bs*4];
+ x2[5] = alpha * pD2[5+bs*5];
+ x2[6] = alpha * pD2[6+bs*6];
+ x2[7] = alpha * pD2[7+bs*7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] += alpha * x2[0];
+ pD2[1+bs*1] += alpha * x2[1];
+ pD2[2+bs*2] += alpha * x2[2];
+ pD2[3+bs*3] += alpha * x2[3];
+ pD2[4+bs*4] += alpha * x2[4];
+ pD2[5+bs*5] += alpha * x2[5];
+ pD2[6+bs*6] += alpha * x2[6];
+ pD2[7+bs*7] += alpha * x2[7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// insert vector to row
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[0*bs] = alpha * x[0];
+ pD[1*bs] = alpha * x[1];
+ pD[2*bs] = alpha * x[2];
+ pD[3*bs] = alpha * x[3];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll*bs] = alpha*x[ll];
+ }
+ return;
+ }
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[0] = alpha * pD[0*bs];
+ x[1] = alpha * pD[1*bs];
+ x[2] = alpha * pD[2*bs];
+ x[3] = alpha * pD[3*bs];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[ll] = alpha*pD[ll*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to row
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[0*bs] += alpha * x[0];
+ pD[1*bs] += alpha * x[1];
+ pD[2*bs] += alpha * x[2];
+ pD[3*bs] += alpha * x[3];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll*bs] += alpha * x[ll];
+ }
+ return;
+ }
+
+
+
+// insert vector to row, sparse formulation
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+ {
+
+ const int bs = 8;
+
+ int ii;
+ float tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+ return;
+ }
+
+
+
+// insert vector to column
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD[0] = x[0];
+ pD[1] = x[1];
+ pD[2] = x[2];
+ pD[3] = x[3];
+ pD[4] = x[4];
+ pD[5] = x[5];
+ pD[6] = x[6];
+ pD[7] = x[7];
+ pD += bs*sdd;
+ x += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD[0] += alpha * x[0];
+ pD[1] += alpha * x[1];
+ pD[2] += alpha * x[2];
+ pD[3] += alpha * x[3];
+ pD[4] += alpha * x[4];
+ pD[5] += alpha * x[5];
+ pD[6] += alpha * x[6];
+ pD[7] += alpha * x[7];
+ pD += bs*sdd;
+ x += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+ const int bs = 8;
+
+ int ii;
+
+ float tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-7; ii+=8)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ tmp = pA[4+bs*0];
+ pA[4+bs*0] = pC[4+bs*0];
+ pC[4+bs*0] = tmp;
+ tmp = pA[5+bs*0];
+ pA[5+bs*0] = pC[5+bs*0];
+ pC[5+bs*0] = tmp;
+ tmp = pA[6+bs*0];
+ pA[6+bs*0] = pC[6+bs*0];
+ pC[6+bs*0] = tmp;
+ tmp = pA[7+bs*0];
+ pA[7+bs*0] = pC[7+bs*0];
+ pC[7+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ const int bs = 8;
+// int nc = S_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ const int bs = 8;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-7; ii+=8)
+ {
+ // unroll 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ pB[4+bs*0] = B[4+lda*0];
+ pB[5+bs*0] = B[5+lda*0];
+ pB[6+bs*0] = B[6+lda*0];
+ pB[7+bs*0] = B[7+lda*0];
+ // unroll 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ pB[4+bs*1] = B[4+lda*1];
+ pB[5+bs*1] = B[5+lda*1];
+ pB[6+bs*1] = B[6+lda*1];
+ pB[7+bs*1] = B[7+lda*1];
+ // unroll 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ pB[4+bs*2] = B[4+lda*2];
+ pB[5+bs*2] = B[5+lda*2];
+ pB[6+bs*2] = B[6+lda*2];
+ pB[7+bs*2] = B[7+lda*2];
+ // unroll 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ pB[4+bs*3] = B[4+lda*3];
+ pB[5+bs*3] = B[5+lda*3];
+ pB[6+bs*3] = B[6+lda*3];
+ pB[7+bs*3] = B[7+lda*3];
+ // update
+ B += 8;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-7; ii+=8)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ pB[4+bs*0] = B[4+lda*0];
+ pB[5+bs*0] = B[5+lda*0];
+ pB[6+bs*0] = B[6+lda*0];
+ pB[7+bs*0] = B[7+lda*0];
+ // update
+ B += 8;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-7; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ pB[4+0*bs] = B[0+4*lda];
+ pB[5+0*bs] = B[0+5*lda];
+ pB[6+0*bs] = B[0+6*lda];
+ pB[7+0*bs] = B[0+7*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ pB[4+1*bs] = B[1+4*lda];
+ pB[5+1*bs] = B[1+5*lda];
+ pB[6+1*bs] = B[1+6*lda];
+ pB[7+1*bs] = B[1+7*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ pB[4+2*bs] = B[2+4*lda];
+ pB[5+2*bs] = B[2+5*lda];
+ pB[6+2*bs] = B[2+6*lda];
+ pB[7+2*bs] = B[2+7*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ pB[4+3*bs] = B[3+4*lda];
+ pB[5+3*bs] = B[3+5*lda];
+ pB[6+3*bs] = B[3+6*lda];
+ pB[7+3*bs] = B[3+7*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ pB[4+0*bs] = B[0+4*lda];
+ pB[5+0*bs] = B[0+5*lda];
+ pB[6+0*bs] = B[0+6*lda];
+ pB[7+0*bs] = B[0+7*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ // TODO update A !!!!!
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+ A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+ A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+ A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+			// unroll 1
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+ A[4+ii+lda*(jj+1)] = ptr_pA[4+bs*1];
+ A[5+ii+lda*(jj+1)] = ptr_pA[5+bs*1];
+ A[6+ii+lda*(jj+1)] = ptr_pA[6+bs*1];
+ A[7+ii+lda*(jj+1)] = ptr_pA[7+bs*1];
+			// unroll 2
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+ A[4+ii+lda*(jj+2)] = ptr_pA[4+bs*2];
+ A[5+ii+lda*(jj+2)] = ptr_pA[5+bs*2];
+ A[6+ii+lda*(jj+2)] = ptr_pA[6+bs*2];
+ A[7+ii+lda*(jj+2)] = ptr_pA[7+bs*2];
+			// unroll 3
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ A[4+ii+lda*(jj+3)] = ptr_pA[4+bs*3];
+ A[5+ii+lda*(jj+3)] = ptr_pA[5+bs*3];
+ A[6+ii+lda*(jj+3)] = ptr_pA[6+bs*3];
+ A[7+ii+lda*(jj+3)] = ptr_pA[7+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+ A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+ A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+ A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ // TODO update A !!!!!
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ A[jj+0+lda*(ii+4)] = ptr_pA[4+bs*0];
+ A[jj+0+lda*(ii+5)] = ptr_pA[5+bs*0];
+ A[jj+0+lda*(ii+6)] = ptr_pA[6+bs*0];
+ A[jj+0+lda*(ii+7)] = ptr_pA[7+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ A[jj+1+lda*(ii+4)] = ptr_pA[4+bs*1];
+ A[jj+1+lda*(ii+5)] = ptr_pA[5+bs*1];
+ A[jj+1+lda*(ii+6)] = ptr_pA[6+bs*1];
+ A[jj+1+lda*(ii+7)] = ptr_pA[7+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ A[jj+2+lda*(ii+4)] = ptr_pA[4+bs*2];
+ A[jj+2+lda*(ii+5)] = ptr_pA[5+bs*2];
+ A[jj+2+lda*(ii+6)] = ptr_pA[6+bs*2];
+ A[jj+2+lda*(ii+7)] = ptr_pA[7+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ A[jj+3+lda*(ii+4)] = ptr_pA[4+bs*3];
+ A[jj+3+lda*(ii+5)] = ptr_pA[5+bs*3];
+ A[jj+3+lda*(ii+6)] = ptr_pA[6+bs*3];
+ A[jj+3+lda*(ii+7)] = ptr_pA[7+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ // TODO update A !!!!!
+ // TODO unroll !!!!!!
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
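+
+
+// Note on storage (applies to the *_libstr routines in this file): a strmat is
+// stored panel-major with panel height bs=8 and panel stride sda=sA->cn, so
+// element (ai,aj) lives at offset ai/bs*bs*sda + ai%bs + aj*bs from sA->pA.
+// A minimal sketch of a single-element read, using only this formula:
+//     float aij = sA->pA[ai/bs*bs*sA->cn + ai%bs + aj*bs]; // same addressing as sgeex1_libstr above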
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-7; ii+=8)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ pA[4+jj*bs] = alpha;
+ pA[5+jj*bs] = alpha;
+ pA[6+jj*bs] = alpha;
+ pA[7+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ srowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
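+
+
+// Note: ipiv is a zero-based pivot sequence applied in order, i.e. for each
+// ii<kmax the whole row ii of sA is swapped with row ipiv[ii]; scolpe_libstr
+// below applies the same kind of permutation to columns.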
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ int offsetA = ai%bs;
+
+ int ii, mna;
+
+ if(offsetA>0)
+ {
+ mna = bs-offsetA;
+ mna = m<mna ? m : mna;
+ kernel_sgesc_8_gen_lib8(n, &alpha, &pA[offsetA], mna);
+ m -= mna;
+ pA += 8*sda;
+ }
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgesc_8_lib8(n, &alpha, &pA[0]);
+ pA += 8*sda;
+ }
+ if(ii<m)
+ {
+ kernel_sgesc_8_gen_lib8(n, &alpha, &pA[0], m-ii);
+ }
+
+ return;
+
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, mna;
+
+#if 1
+ if(offsetB>0)
+ {
+ if(offsetB>offsetA)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_0_gen_lib8(n, &pA[offsetA], &pB[offsetB], mna);
+ m -= mna;
+ //pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else
+ {
+ if(offsetA==0)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_0_gen_lib8(n, &pA[0], &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==1)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_1_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==2)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_2_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==3)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_3_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==4)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_4_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==5)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_5_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==6)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_6_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==7)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_7_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ }
+ }
+#endif
+
+ // same alignment
+ if(offsetA==offsetB)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_0_lib8(n, pA, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_0_gen_lib8(n, pA, pB, m-ii);
+ }
+ return;
+ }
+ // XXX different alignment: search tree ???
+ // skip one element of A
+ else if(offsetA==(offsetB+1)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_1_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_1_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ }
+ // skip two elements of A
+ else if(offsetA==(offsetB+2)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_2_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_2_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip three elements of A
+ else if(offsetA==(offsetB+3)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_3_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_3_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip four elements of A
+ else if(offsetA==(offsetB+4)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_4_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_4_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip five elements of A
+ else if(offsetA==(offsetB+5)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_5_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_5_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip six elements of A
+ else if(offsetA==(offsetB+6)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_6_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_6_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip seven elements of A
+ else //if(offsetA==(offsetB+7)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_7_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_7_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+
+ return;
+
+ }
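+
+
+// Note on the dispatch above: the copy kernel is selected on the relative row
+// offset between A and B within a panel (the "skip X elements of A" cases);
+// the misaligned kernels also take sda because the shifted source spans two
+// consecutive panels of A. sgead_libstr below uses the same scheme.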
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strcp_l_lib(m, ai%bs, pA, sda, ci%bs, pC, sdc);
+ // XXX uses full matrix copy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+// sgecp_libstr(m, m, sA, ai, aj, sC, ci, cj);
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgead_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgead_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgead_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgead_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgead_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgead_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgead_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgead_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgead_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgead_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, mna;
+
+#if 1
+ if(offsetB>0)
+ {
+ if(offsetB>offsetA)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[offsetA], &pB[offsetB], mna);
+ m -= mna;
+ //pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else
+ {
+ if(offsetA==0)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[0], &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==1)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_1_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==2)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_2_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==3)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_3_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==4)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_4_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==5)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_5_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==6)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_6_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==7)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_7_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ }
+ }
+#endif
+
+ // same alignment
+ if(offsetA==offsetB)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_0_lib8(n, &alpha, pA, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_0_gen_lib8(n, &alpha, pA, pB, m-ii);
+ }
+ return;
+ }
+ // XXX different alignment: search tree ???
+ // skip one element of A
+ else if(offsetA==(offsetB+1)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_1_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_1_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ }
+ // skip two elements of A
+ else if(offsetA==(offsetB+2)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_2_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_2_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip three elements of A
+ else if(offsetA==(offsetB+3)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_3_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_3_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip four elements of A
+ else if(offsetA==(offsetB+4)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_4_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_4_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip five elements of A
+ else if(offsetA==(offsetB+5)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_5_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_5_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip six elements of A
+ else if(offsetA==(offsetB+6)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_6_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_6_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip seven elements of A
+ else //if(offsetA==(offsetB+7)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_7_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_7_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+
+ return;
+
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgetr_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgetr_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgetr_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgetr_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgetr_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgetr_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgetr_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgetr_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: n x m
+ if(bi+n > sB->m) printf("\n***** sgetr_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+ if(bj+m > sB->n) printf("\n***** sgetr_libstr : bj+m > col(B) : %d+%d > %d *****\n", bj, m, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, nna;
+
+ if(offsetA==0)
+ {
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_0_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for(ii=0; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_0_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_0_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+	// TODO log search for offsetA>0 ???
+ else if(offsetA==1)
+ {
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_1_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for(ii=0; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_1_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_1_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==2)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_2_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_2_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_2_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==3)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_3_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_3_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_3_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==4)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_4_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_4_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_4_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==5)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_5_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_5_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_5_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==6)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_6_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_6_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_6_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==7)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_7_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_7_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_7_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a strvec into the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
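+
+
+// In the sparse formulation idx selects which entries are touched: the loop
+// above performs D[di+idx[jj], dj+idx[jj]] = alpha*x[jj] for jj=0..kmax-1,
+// and the extract/add variants below follow the same pattern.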
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add a scaled strvec to another strvec and insert into the diagonal of a strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ srowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
+// add a scaled strvec to a strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ svecad_libsp(kmax, idx, alpha, x, y);
+ return;
+ }
+
+
+
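+// insert a scaled strvec into a strvec, sparse formulation (z[idx[ii]] = alpha*x[ii])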
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
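+// extract a scaled strvec from a strvec, sparse formulation (z[ii] = alpha*x[idx[ii]])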
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/auxiliary/v_aux_ext_dep_lib.c b/auxiliary/v_aux_ext_dep_lib.c
new file mode 100644
index 0000000..3bf5f90
--- /dev/null
+++ b/auxiliary/v_aux_ext_dep_lib.c
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if defined(OS_WINDOWS)
+#include <malloc.h> // declares _aligned_malloc / _aligned_free
+#endif
+
+
+
+/* creates a zero matrix given the size in bytes */
+void v_zeros(void **ptrA, int size)
+ {
+ *ptrA = (void *) malloc(size);
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void v_zeros_align(void **ptrA, int size)
+ {
+#if defined(OS_WINDOWS)
+ *ptrA = _aligned_malloc( size, 64 );
+#else
+ int err = posix_memalign(ptrA, 64, size);
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+#endif
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void v_free(void *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void v_free_align(void *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* creates a zero matrix given the size in bytes */
+void c_zeros(char **ptrA, int size)
+ {
+ *ptrA = malloc(size);
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void c_zeros_align(char **ptrA, int size)
+ {
+#if defined(OS_WINDOWS)
+ *ptrA = _aligned_malloc( size, 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, size);
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+ *ptrA = temp;
+#endif
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void c_free(char *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void c_free_align(char *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
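+
+
+
+/* usage sketch (illustration only, not part of the library API): allocate a
+   cache-line-aligned, zero-initialized buffer with v_zeros_align() and release
+   it with the matching v_free_align() */
+#if 0
+static double * example_alloc_vector(int n)
+	{
+	void *mem;
+	v_zeros_align(&mem, n*sizeof(double)); // 64-byte aligned, zeroed
+	return (double *) mem; // to be released with v_free_align()
+	}
+#endif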
+
diff --git a/blas/Makefile b/blas/Makefile
new file mode 100644
index 0000000..304b448
--- /dev/null
+++ b/blas/Makefile
@@ -0,0 +1,88 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), GENERIC)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_blas1_lib.o d_blas2_lib.o d_blas2_diag_lib.o d_blas3_lib.o d_blas3_diag_lib.o d_lapack_lib.o
+OBJS += s_blas1_lib.o s_blas2_lib.o s_blas2_diag_lib.o s_blas3_lib.o s_blas3_diag_lib.o s_lapack_lib.o
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
+d_blas1_lib.o: d_blas1_lib.c x_blas1_lib.c
+s_blas1_lib.o: s_blas1_lib.c x_blas1_lib.c
+d_blas2_lib.o: d_blas2_lib.c x_blas2_lib.c
+s_blas2_lib.o: s_blas2_lib.c x_blas2_lib.c
+d_blas2_diag_lib.o: d_blas2_diag_lib.c x_blas2_diag_lib.c
+s_blas2_diag_lib.o: s_blas2_diag_lib.c x_blas2_diag_lib.c
+d_blas3_lib.o: d_blas3_lib.c x_blas3_lib.c
+s_blas3_lib.o: s_blas3_lib.c x_blas3_lib.c
+d_blas3_diag_lib.o: d_blas3_diag_lib.c x_blas3_diag_lib.c
+s_blas3_diag_lib.o: s_blas3_diag_lib.c x_blas3_diag_lib.c
+d_lapack_lib.o: d_lapack_lib.c x_lapack_lib.c
+s_lapack_lib.o: s_lapack_lib.c x_lapack_lib.c
diff --git a/blas/d_blas.h b/blas/d_blas.h
new file mode 100644
index 0000000..fc5058b
--- /dev/null
+++ b/blas/d_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the BLAS and LAPACK routines employed by the BLASFEO wrapper (LA=BLAS)
+
+// level 1
+void dcopy_(int *m, double *x, int *incx, double *y, int *incy);
+void daxpy_(int *m, double *alpha, double *x, int *incx, double *y, int *incy);
+void dscal_(int *m, double *alpha, double *x, int *incx);
+
+// level 2
+void dgemv_(char *ta, int *m, int *n, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dsymv_(char *uplo, int *m, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *A, int *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+void dsyrk_(char *uplo, char *trans, int *n, int *k, double *alpha, double *A, int *lda, double *beta, double *C, int *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+
+// lapack
+int dpotrf_(char *uplo, int *m, double *A, int *lda, int *info);
+int dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
+void dgeqrf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
+void dgeqr2_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *info);
+void dgelqf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/d_blas1_lib.c b/blas/d_blas1_lib.c
new file mode 100644
index 0000000..1fd19d3
--- /dev/null
+++ b/blas/d_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define AXPY_LIBSTR daxpy_libstr
+#define VECMULDOT_LIBSTR dvecmuldot_libstr
+#define DOT_LIBSTR ddot_libstr
+
+#define AXPY daxpy_
+#define COPY dcopy_
+
+
+#include "x_blas1_lib.c"
diff --git a/blas/d_blas1_lib4.c b/blas/d_blas1_lib4.c
new file mode 100644
index 0000000..a4155a9
--- /dev/null
+++ b/blas/d_blas1_lib4.c
@@ -0,0 +1,263 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void daxpy_libstr(int m, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ v_alpha, v_tmp,
+ v_x0, v_y0,
+ v_x1, v_y1;
+#endif
+
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ v_alpha = _mm256_broadcast_sd( &alpha );
+ for( ; ii<m-7; ii+=8)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_x1 = _mm256_loadu_pd( &x[ii+4] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_y1 = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_y0 = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+ v_y1 = _mm256_fmadd_pd( v_alpha, v_x1, v_y1 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+ v_y0 = _mm256_add_pd( v_tmp, v_y0 );
+ v_tmp = _mm256_mul_pd( v_alpha, v_x1 );
+ v_y1 = _mm256_add_pd( v_tmp, v_y1 );
+#endif
+ _mm256_storeu_pd( &z[ii+0], v_y0 );
+ _mm256_storeu_pd( &z[ii+4], v_y1 );
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii] );
+ v_y0 = _mm256_loadu_pd( &y[ii] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_y0 = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+ v_y0 = _mm256_add_pd( v_tmp, v_y0 );
+#endif
+ _mm256_storeu_pd( &z[ii], v_y0 );
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+#endif
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+
+ return;
+ }
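+
+
+/* usage sketch (illustration only): z = y + alpha*x on plain arrays, obtained by
+   pointing the pa member of d_strvec at existing buffers; daxpy_libstr above only
+   dereferences pa, so the remaining d_strvec fields are left unset in this sketch */
+#if 0
+static void example_daxpy(int n, double alpha, double *x, double *y, double *z)
+	{
+	struct d_strvec sx, sy, sz;
+	sx.pa = x;
+	sy.pa = y;
+	sz.pa = z;
+	daxpy_libstr(n, alpha, &sx, 0, &sy, 0, &sz, 0);
+	}
+#endif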
+
+
+
+// multiply two vectors and compute dot product
+double dvecmuldot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+ int ii;
+ double dot = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m128d
+ u_tmp, u_dot;
+ __m256d
+ v_tmp,
+ v_x0, v_y0, v_z0;
+
+ v_tmp = _mm256_setzero_pd();
+#endif
+
+ ii = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_z0 = _mm256_mul_pd( v_x0, v_y0 );
+ _mm256_storeu_pd( &z[ii+0], v_z0 );
+ v_tmp = _mm256_add_pd( v_tmp, v_z0 );
+ }
+#endif
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // dot product
+ u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_tmp ), _mm256_extractf128_pd( v_tmp, 0x1 ) );
+ u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+ u_dot = _mm_load_sd( &dot );
+ u_dot = _mm_add_sd( u_dot, u_tmp );
+ _mm_store_sd( &dot, u_dot );
+#endif
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+double ddot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int ii;
+ double dot = 0.0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m128d
+ u_dot0, u_x0, u_y0, u_tmp;
+ __m256d
+ v_dot0, v_dot1, v_x0, v_x1, v_y0, v_y1, v_tmp;
+
+ v_dot0 = _mm256_setzero_pd();
+ v_dot1 = _mm256_setzero_pd();
+ u_dot0 = _mm_setzero_pd();
+
+ ii = 0;
+ for(; ii<m-7; ii+=8)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_x1 = _mm256_loadu_pd( &x[ii+4] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_y1 = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_dot0 = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+ v_dot1 = _mm256_fmadd_pd( v_x1, v_y1, v_dot1 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+ v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+ v_tmp = _mm256_mul_pd( v_x1, v_y1 );
+ v_dot1 = _mm256_add_pd( v_dot1, v_tmp );
+#endif
+ }
+ for(; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_dot0 = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+ v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+#endif
+ }
+ for(; ii<m; ii++)
+ {
+ u_x0 = _mm_load_sd( &x[ii+0] );
+ u_y0 = _mm_load_sd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ u_dot0 = _mm_fmadd_sd( u_x0, u_y0, u_dot0 );
+#else // sandy bridge
+ u_tmp = _mm_mul_sd( u_x0, u_y0 );
+ u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+#endif
+ }
+ // reduce
+ v_dot0 = _mm256_add_pd( v_dot0, v_dot1 );
+ u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_dot0 ), _mm256_extractf128_pd( v_dot0, 0x1 ) );
+ u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+ u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+ _mm_store_sd( &dot, u_dot0 );
+#else // no haswell, no sandy bridge
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+#endif // haswell, sandy bridge
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas2_diag_lib.c b/blas/d_blas2_diag_lib.c
new file mode 100644
index 0000000..8bc3f68
--- /dev/null
+++ b/blas/d_blas2_diag_lib.c
@@ -0,0 +1,45 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRVEC d_strvec
+
+#define GEMV_DIAG_LIBSTR dgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
diff --git a/blas/d_blas2_lib.c b/blas/d_blas2_lib.c
new file mode 100644
index 0000000..9c39fe2
--- /dev/null
+++ b/blas/d_blas2_lib.c
@@ -0,0 +1,71 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMV_N_LIBSTR dgemv_n_libstr
+#define GEMV_NT_LIBSTR dgemv_nt_libstr
+#define GEMV_T_LIBSTR dgemv_t_libstr
+#define SYMV_L_LIBSTR dsymv_l_libstr
+#define TRMV_LNN_LIBSTR dtrmv_lnn_libstr
+#define TRMV_LTN_LIBSTR dtrmv_ltn_libstr
+#define TRMV_UNN_LIBSTR dtrmv_unn_libstr
+#define TRMV_UTN_LIBSTR dtrmv_utn_libstr
+#define TRSV_LNN_LIBSTR dtrsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR dtrsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR dtrsv_lnu_libstr
+#define TRSV_LTN_LIBSTR dtrsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR dtrsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR dtrsv_ltu_libstr
+#define TRSV_UNN_LIBSTR dtrsv_unn_libstr
+#define TRSV_UTN_LIBSTR dtrsv_utn_libstr
+
+#define COPY dcopy_
+#define GEMV dgemv_
+#define SYMV dsymv_
+#define TRMV dtrmv_
+#define TRSV dtrsv_
+
+
+
+#include "x_blas2_lib.c"
diff --git a/blas/d_blas2_lib4.c b/blas/d_blas2_lib4.c
new file mode 100644
index 0000000..cab8e3c
--- /dev/null
+++ b/blas/d_blas2_lib4.c
@@ -0,0 +1,1060 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ // suppose m>=n
+ if(m<n)
+ m = n;
+
+ const int bs = 4;
+
+ double alpha = -1.0;
+ double beta = 1.0;
+
+ int i;
+
+ if(x!=y)
+ {
+ for(i=0; i<m; i++)
+ y[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
+ }
+ if(i<n)
+ {
+ kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
+ i+=4;
+ }
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-7; i+=8)
+ {
+ kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
+ }
+ if(i<m-3)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+ i+=4;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+ }
+#endif
+ if(i<m)
+ {
+ kernel_dgemv_n_4_gen_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], 0, m-i);
+ i+=4;
+ }
+
+ }
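+
+
+/* Editor's note: illustrative reference only, kept out of the build. It spells
+ * out the semantics of dtrsv_ln_inv_lib above on a plain column-major A with
+ * leading dimension lda (the routine above works on the panel-major pA
+ * layout): forward substitution on the leading n-by-n lower triangular block,
+ * whose reciprocal diagonal is given in inv_diag_A, followed by
+ * y[n:m] = x[n:m] - A[n:m,0:n]*y[0:n] for the remaining rows. */
+#if 0
+static void dtrsv_ln_inv_ref(int m, int n, double *A, int lda, double *inv_diag_A, double *x, double *y)
+	{
+	int ii, jj;
+	double tmp;
+	// forward substitution on the triangular part
+	for(ii=0; ii<n; ii++)
+		{
+		tmp = x[ii];
+		for(jj=0; jj<ii; jj++)
+			tmp -= A[ii+lda*jj] * y[jj];
+		y[ii] = tmp * inv_diag_A[ii];
+		}
+	// rectangular part below the triangle
+	for(ii=n; ii<m; ii++)
+		{
+		tmp = x[ii];
+		for(jj=0; jj<n; jj++)
+			tmp -= A[ii+lda*jj] * y[jj];
+		y[ii] = tmp;
+		}
+	}
+#endif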
+
+
+
+void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(n>m)
+ n = m;
+
+ const int bs = 4;
+
+ int i;
+
+ if(x!=y)
+ for(i=0; i<m; i++)
+ y[i] = x[i];
+
+ i=0;
+ if(n%4==1)
+ {
+ kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
+ i++;
+ }
+ else if(n%4==2)
+ {
+ kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
+ i+=2;
+ }
+ else if(n%4==3)
+ {
+ kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
+ i+=3;
+ }
+ for(; i<n-3; i+=4)
+ {
+ kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
+ }
+
+ }
+
+
+
+void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+	// copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<n-5; ii+=6)
+ {
+ kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+#endif
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+
+ }
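+
+
+/* Editor's note: illustrative reference only, kept out of the build. The fused
+ * routine above computes, in a single sweep over A (written here for a plain
+ * column-major A with leading dimension lda):
+ *   z_n = beta_n*y_n + alpha_n * A  * x_n   (length m)
+ *   z_t = beta_t*y_t + alpha_t * A' * x_t   (length n)  */
+#if 0
+static void dgemv_nt_ref(int m, int n, double alpha_n, double alpha_t, double *A, int lda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
+	{
+	int ii, jj;
+	for(ii=0; ii<m; ii++)
+		z_n[ii] = beta_n * y_n[ii];
+	for(jj=0; jj<n; jj++)
+		z_t[jj] = beta_t * y_t[jj];
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			z_n[ii] += alpha_n * A[ii+lda*jj] * x_n[jj]; // A  * x_n contribution
+			z_t[jj] += alpha_t * A[ii+lda*jj] * x_t[ii]; // A' * x_t contribution
+			}
+		}
+	}
+#endif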
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 4 - ai%bs;
+ z += 4 - ai%bs;
+ m -= 4 - ai%bs;
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<m-11; i+=12)
+ {
+ kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ for( ; i<m-7; i+=8)
+ {
+ kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<m-3)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ i+=4;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+#endif
+ if(i<m)
+ {
+ kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
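+
+
+/* Editor's note: hypothetical usage sketch, not part of the library. It shows
+ * how the libstr interface addresses a sub-matrix/sub-vector through the
+ * (ai,aj) and xi/yi/zi offsets; it assumes the EXT_DEP auxiliary helpers
+ * d_allocate_strmat, d_allocate_strvec, d_free_strmat and d_free_strvec are
+ * available. */
+#if 0
+void example_dgemv_n_libstr(int m, int n)
+	{
+	struct d_strmat sA;
+	struct d_strvec sx, sy, sz;
+	d_allocate_strmat(m+1, n, &sA); // one extra row so that ai=1 below stays in range
+	d_allocate_strvec(n, &sx);
+	d_allocate_strvec(m, &sy);
+	d_allocate_strvec(m, &sz);
+	// ... fill sA, sx and sy here ...
+	// z[0:m] = 1.0 * A[1:1+m, 0:0+n] * x[0:n] + 1.0 * y[0:m]
+	dgemv_n_libstr(m, n, 1.0, &sA, 1, 0, &sx, 0, 1.0, &sy, 0, &sz, 0);
+	d_free_strmat(&sA);
+	d_free_strvec(&sx);
+	d_free_strvec(&sy);
+	d_free_strvec(&sz);
+	}
+#endif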
+
+
+
+void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<n-11; i+=12)
+ {
+ kernel_dgemv_t_12_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ for( ; i<n-7; i+=8)
+ {
+ kernel_dgemv_t_8_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n-3)
+ {
+ kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ i+=4;
+ }
+#else
+ for( ; i<n-3; i+=4)
+ {
+ kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ if(i<n)
+ {
+ kernel_dgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ else // TODO kernel 8
+ {
+ i = 0;
+ for( ; i<n; i+=4)
+ {
+ kernel_dgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t)
+ {
+ if(ai!=0)
+ {
+ printf("\ndgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x_n = sx_n->pa + xi_n;
+ double *x_t = sx_t->pa + xi_t;
+ double *y_n = sy_n->pa + yi_n;
+ double *y_t = sy_t->pa + yi_t;
+ double *z_n = sz_n->pa + zi_n;
+ double *z_t = sz_t->pa + zi_t;
+ dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+ return;
+ }
+
+
+
+void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, n1;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+	// copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+ if(ai%bs!=0) // 1, 2, 3
+ {
+ n1 = 4-ai%bs;
+ kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+ }
+
+ return;
+ }
+
+
+
+// m >= n
+void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ if(m-n>0)
+ dgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ double *pA2 = pA;
+ double *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ double *pA3, *x3;
+
+ double alpha = 1.0;
+ double beta = 1.0;
+
+ double zt[4];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%4!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-3; ii+=4)
+ {
+ pA2 -= bs*sda;
+ z2 -= 4;
+ n2 -= 4;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%4!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==3)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ double xt[4];
+ double zt[4];
+
+ double alpha = 1.0;
+ double beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 1;
+ x += 3;
+ kernel_dgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 2;
+ x += 2;
+ kernel_dgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==3)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 3;
+ x += 1;
+ kernel_dgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-3; jj+=4)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+ zt[3] = pA[3+bs*3]*x[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*4;
+ z += 4;
+ }
+ if(jj<n)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+
+ return;
+
+ }
+
+
+
+void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ int i;
+
+ i=0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
+ pA += 8*sda+8*bs;
+ x += 8;
+ z += 8;
+ }
+#endif
+ for(; i<m-3; i+=4)
+ {
+ kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
+ pA += 4*sda+4*bs;
+ x += 4;
+ z += 4;
+ }
+ if(m>i)
+ {
+ if(m-i==1)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ else if(m-i==2)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+ z[1] = pA[1+bs*1]*x[1];
+ }
+ else // if(m-i==3)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+ z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+ z[2] = pA[2+bs*2]*x[2];
+ }
+ }
+
+ return;
+
+ }
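+
+
+/* Editor's note: illustrative reference only, kept out of the build. Dense
+ * (column-major) semantics of the triangular matrix-vector product above:
+ * z = U*x, with U the m-by-m upper triangular part of A, diagonal included. */
+#if 0
+static void dtrmv_unn_ref(int m, double *A, int lda, double *x, double *z)
+	{
+	int ii, jj;
+	double tmp;
+	for(ii=0; ii<m; ii++)
+		{
+		tmp = 0.0;
+		for(jj=ii; jj<m; jj++)
+			tmp += A[ii+lda*jj] * x[jj];
+		z[ii] = tmp;
+		}
+	}
+#endif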
+
+
+
+void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ int ii, idx;
+
+ double *ptrA;
+
+ ii=0;
+ idx = m/bs*bs;
+ if(m%bs!=0)
+ {
+ kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+ ii += m%bs;
+ }
+ idx -= 4;
+ for(; ii<m; ii+=4)
+ {
+ kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+ idx -= 4;
+ }
+
+ return;
+
+ }
+
+
+
+void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** dtrsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** dtrsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_lnn_mn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
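+	// if the sub-matrix starts at the top-left corner of sA (ai==0 and aj==0),
+	// the inverted diagonal is computed once and cached in sA->dA (use_dA flag)
+	// so that later solves with the same matrix can reuse it; otherwise it is
+	// recomputed here and the cache is marked invalid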
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** dtrsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** dtrsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_ltn_mn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_lnn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_ltn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas3_diag_lib.c b/blas/d_blas3_diag_lib.c
new file mode 100644
index 0000000..ff69317
--- /dev/null
+++ b/blas/d_blas3_diag_lib.c
@@ -0,0 +1,47 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMM_R_DIAG_LIBSTR dgemm_r_diag_libstr
+#define GEMM_L_DIAG_LIBSTR dgemm_l_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
diff --git a/blas/d_blas3_diag_lib4.c b/blas/d_blas3_diag_lib4.c
new file mode 100644
index 0000000..2731d1f
--- /dev/null
+++ b/blas/d_blas3_diag_lib4.c
@@ -0,0 +1,184 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void dgemm_diag_left_lib(int m, int n, double alpha, double *dA, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+ }
+ }
+ else
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii==1)
+ kernel_dgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else if(m-ii==2)
+ kernel_dgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else // if(m-ii==3)
+ kernel_dgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+
+ }
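+
+
+/* Editor's note: illustrative reference only, kept out of the build.
+ * Column-major semantics of the routine above:
+ * D = alpha * diag(dA) * B + beta * C, with dA of length m. */
+#if 0
+static void dgemm_diag_left_ref(int m, int n, double alpha, double *dA, double *B, int ldb, double beta, double *C, int ldc, double *D, int ldd)
+	{
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		for(ii=0; ii<m; ii++)
+			D[ii+ldd*jj] = alpha * dA[ii] * B[ii+ldb*jj] + beta * C[ii+ldc*jj];
+	}
+#endif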
+
+
+
+void dgemm_diag_right_lib(int m, int n, double alpha, double *pA, int sda, double *dB, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_dgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_dgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_dgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
+void dgemm_l_diag_libstr(int m, int n, double alpha, struct d_strvec *sA, int ai, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ if(bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *dA = sA->pa + ai;
+ double *pB = sB->pA + bj*bs;
+ double *pC = sC->pA + cj*bs;
+ double *pD = sD->pA + dj*bs;
+ dgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+
+
+// dgemm with B diagonal matrix (stored as strvec)
+void dgemm_r_diag_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sB, int bi, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*bs;
+ double *dB = sB->pa + bi;
+ double *pC = sC->pA + cj*bs;
+ double *pD = sD->pA + dj*bs;
+ dgemm_diag_right_lib(m, n, alpha, pA, sda, dB, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas3_lib.c b/blas/d_blas3_lib.c
new file mode 100644
index 0000000..27c20ab
--- /dev/null
+++ b/blas/d_blas3_lib.c
@@ -0,0 +1,69 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+
+#define GEMM_NN_LIBSTR dgemm_nn_libstr
+#define GEMM_NT_LIBSTR dgemm_nt_libstr
+#define SYRK_LN_LIBSTR dsyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR dsyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR dtrmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR dtrmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR dtrsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR dtrsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR dtrsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR dtrsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR dtrsm_rutn_libstr
+
+#define COPY dcopy_
+#define GEMM dgemm_
+#define SYRK dsyrk_
+#define TRMM dtrmm_
+#define TRSM dtrsm_
+
+
+
+#include "x_blas3_lib.c"
diff --git a/blas/d_blas3_lib4.c b/blas/d_blas3_lib4.c
new file mode 100644
index 0000000..dfa3cb8
--- /dev/null
+++ b/blas/d_blas3_lib4.c
@@ -0,0 +1,2728 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void dgemm_nt_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// definitions of the clean-up loops
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+ }
+
+ if(j<n-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+ }
+ else if(j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_4:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
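+
+
+/* Editor's note: illustrative reference only, kept out of the build.
+ * Column-major semantics of the blocked routine above:
+ * D = alpha * A * B' + beta * C, with A m-by-k and B n-by-k; the
+ * target-specific kernels above compute the same result on panel-major data,
+ * 4, 8 or 12 rows at a time. */
+#if 0
+static void dgemm_nt_ref(int m, int n, int k, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	double tmp;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			tmp = 0.0;
+			for(ll=0; ll<k; ll++)
+				tmp += A[ii+lda*ll] * B[jj+ldb*ll];
+			D[ii+ldd*jj] = alpha * tmp + beta * C[ii+ldc*jj];
+			}
+		}
+	}
+#endif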
+
+
+
+#if 0
+void dgemm_nn_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], 0, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// definitions of the clean-up loops
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+#endif
+
+
+
+void dtrmm_nt_ru_lib(int m, int n, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+// XXX there is a bug here !!!!!!
+#if 0//defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<5)
+ {
+ goto left_4;
+ }
+ if(m-i<9)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<5)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ // clean up
+ left_12:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+// }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ // clean up
+ left_8:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+// }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+// }
+ return;
+
+ }
+
+
+
+// D <= B * A^{-T} , with A lower triangular with unit diagonal
+void dtrsm_nt_rl_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+
+ }
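+
+
+/* Editor's note: illustrative reference only, kept out of the build.
+ * Column-major semantics of the routine above: solve D * A' = B for D, with A
+ * n-by-n lower triangular and unit diagonal (the stored diagonal is not read),
+ * B and D m-by-n. */
+#if 0
+static void dtrsm_nt_rl_one_ref(int m, int n, double *A, int lda, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	double tmp;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			tmp = B[ii+ldb*jj];
+			for(ll=0; ll<jj; ll++)
+				tmp -= D[ii+ldd*ll] * A[jj+lda*ll];
+			D[ii+ldd*jj] = tmp; // unit diagonal: no division
+			}
+		}
+	}
+#endif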
+
+
+
+// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_nt_ru_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, idx;
+
+ int rn = n%4;
+
+ double *dummy;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+#endif
+
+ left_4:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A lower triangular with unit diagonal
+void dtrsm_nn_ll_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-11; i+=12)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<=4)
+ goto left_4;
+ if(m-i<=8)
+ goto left_8;
+ else
+ goto left_12;
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<m-7; i+=8)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<=4)
+ goto left_4;
+ else
+ goto left_8;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_nn_lu_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, idx;
+ double *dummy;
+
+ i = 0;
+ int rm = m%4;
+ if(rm>0)
+ {
+		// TODO code the final case explicitly
+ idx = m-rm; // position of the part to do
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, inv_diag_A+idx, rm, n-j);
+ }
+ // TODO
+ i += rm;
+ }
+// int em = m-rm;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-8; i+=12)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12), 12, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, inv_diag_A+(idx-12), 4, n-j);
+ }
+ }
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-4; i+=8)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8), 8, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+ }
+ }
+#endif
+ for( ; i<m; i+=4)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+ }
+ }
+
+ // common return
+ return;
+
+ }
+
+
+
+#if 0
+void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ // TODO remove
+ double alpha, beta;
+ if(alg==0)
+ {
+ alpha = 1.0;
+ beta = 0.0;
+ }
+ else if(alg==1)
+ {
+ alpha = 1.0;
+ beta = 1.0;
+ }
+ else
+ {
+ alpha = -1.0;
+ beta = 1.0;
+ }
+
+ // TODO remove
+ int k = cv[nv-1];
+
+ const int ps = 4;
+
+ int i, j, l;
+ int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+ i = 0;
+ ii = 0;
+ iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+
+ while(ii<nv && rv[ii]<i+8)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+
+ while(ii<nv && rv[ii]<i+4)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+// k0 = kii;
+// printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+// printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+ kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+// printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+			if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
+ }
+ }
+ }
+ return;
+#endif
+
+ left_4:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+#endif
+
+
+
+/****************************
+* new interface
+****************************/
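+
+// Hedged usage sketch of the *_libstr interface defined below (disabled code,
+// illustration only): the d_allocate_strmat / d_cvt_mat2strmat / d_free_strmat
+// helpers are assumed to come from blasfeo_d_aux.h; argument layouts follow
+// the prototypes in this file.
+#if 0
+static void example_dgemm_nt_libstr(int m, int n, int k, double *A, double *B, double *C)
+	{
+	struct d_strmat sA, sB, sC, sD;
+	d_allocate_strmat(m, k, &sA);
+	d_allocate_strmat(n, k, &sB);
+	d_allocate_strmat(m, n, &sC);
+	d_allocate_strmat(m, n, &sD);
+	d_cvt_mat2strmat(m, k, A, m, &sA, 0, 0); // pack column-major A (lda=m)
+	d_cvt_mat2strmat(n, k, B, n, &sB, 0, 0); // pack column-major B (ldb=n)
+	d_cvt_mat2strmat(m, n, C, m, &sC, 0, 0);
+	// D = 1.0 * A * B^T + 0.0 * C; the index pairs select the (0,0) corner of each operand
+	dgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+	d_cvt_strmat2mat(m, n, &sD, 0, 0, C, m); // unpack the result back into C
+	d_free_strmat(&sA);
+	d_free_strmat(&sB);
+	d_free_strmat(&sC);
+	d_free_strmat(&sD);
+	}
+#endif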
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm nt
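+// Note on the offset handling below (descriptive, ps=4): air = ai%4 and
+// bir = bi%4 are the row offsets of A and B inside their first 4-row panel.
+// C and D are re-aligned to A through ci0 = ci-air and di0 = di-air; a
+// negative value steps one panel back and the remainder becomes
+// offsetC/offsetD, which the *_gen_ kernels use to store across panel
+// boundaries. E.g. ai=2, ci=1 gives ci0=-1, hence pC -= 4*sdc and offsetC=3.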
+void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+
+ if(ai==0 & bi==0 & ci==0 & di==0)
+ {
+ dgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+ int ci0 = ci-air;
+ int di0 = di-air;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/ps*ps*sdc;
+ offsetC = ci0%ps;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = ps+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
+
+ int i, j, l;
+
+ int idxB;
+
+ // clean up at the beginning
+ if(air!=0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ if(m>5)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ idxB += 4;
+ }
+ m -= 2*ps-air;
+ pA += 2*ps*sda;
+ pC += 2*ps*sdc;
+ pD += 2*ps*sdd;
+ }
+ else // m<=4
+ {
+#endif
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ idxB += 4;
+ }
+ m -= ps-air;
+ pA += ps*sda;
+ pC += ps*sdc;
+ pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // nothing more to do
+ return;
+ }
+#endif
+ }
+ i = 0;
+ // main loop
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+ if(i<m)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+#endif
+
+ return;
+
+ }
+
+
+
+// dgemm nn
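+// Note (descriptive): compared to dgemm_nt above, B is not transposed here,
+// so its row offset bir = bi%4 cannot be folded into a simple pointer shift;
+// it is passed to the nn kernels as offsetB, together with sdb, so that the k
+// rows of B can be streamed across the 4-row panel boundaries of the packed
+// storage.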
+void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+
+ int offsetB = bir;
+
+ int ci0 = ci-air;
+ int di0 = di-air;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/ps*ps*sdc;
+ offsetC = ci0%ps;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = ps+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
+
+ int i, j, l;
+
+ // clean up at the beginning
+ if(air!=0)
+ {
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ if(m>5)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ }
+ m -= 2*ps-air;
+ pA += 2*ps*sda;
+			pC += 2*ps*sdc;
+			pD += 2*ps*sdd;
+ }
+ else // m-i<=4
+ {
+#endif
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ }
+		m -= ps-air;
+		pA += ps*sda;
+		pC += ps*sdc;
+		pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ // nothing more to do
+ return;
+ }
+#endif
+ }
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+// kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+(i+8)*sdc], sdc, 0, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+// TODO 12x4
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+//		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+(i+8)*sdc], sdc, offsetD, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+
+ }
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
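+// In formulas (descriptive note): "rltn" = right, lower, transposed, not-unit,
+// i.e. solve D * A^T = alpha * B for D, hence D = alpha * B * A^{-T}. Only
+// alpha == 1.0 is implemented below; with A a Cholesky factor from
+// dpotrf_l_libstr this is the usual right triangular solve.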
+void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ // TODO alpha !!!!!
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+
+ int i, j;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+// dtrsm_nt_rl_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
+ }
+ if(j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ j += 4;
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ }
+ if(j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ j += 4;
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (note: B, the first matrix argument, is the triangular one)
+void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0)
+ {
+ printf("\ndtrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (note: B, the first matrix argument, is the triangular one)
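+// In formulas (descriptive note): with B lower triangular this computes
+// D = alpha * A * B; both A and B may start at an arbitrary row inside their
+// 4-row panel (air, bir), which is why the di0/offsetD re-alignment below
+// mirrors the one used in dgemm_nn_libstr.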
+void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+ {
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pD = sD->pA + dj*ps;
+
+ int offsetB = bir;
+
+ int di0 = di-air;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
+
+ int ii, jj;
+
+ if(air!=0)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
+ }
+ m -= ps-air;
+ pA += ps*sda;
+ pD += ps*sdd;
+ }
+ ii = 0;
+ if(offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-11; ii+=12)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-j>=6 !!!!!
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
+// kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[(ii+8)*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[(ii+8)*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ if(ii<m-8)
+ goto left_12;
+ else if(ii<m-4)
+ goto left_8;
+ else
+ goto left_4_gen;
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-7; ii+=8)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 8, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ if(ii<m-4)
+ goto left_8_gen;
+ else
+ goto left_4_gen;
+ }
+#else
+ for(; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4_gen;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-4; ii+=8)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; ii<m; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8_gen:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+#endif
+
+ left_4_gen:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+
+ }
+
+
+
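+// dsyrk_ln: D = beta*C + alpha * A * B^T, computing only the lower-triangular
+// part of the m x m result (descriptive note: the strictly-lower block rows go
+// through dgemm kernels, the diagonal blocks through the dsyrk_nt_l kernels).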
+void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0 | bi!=0)
+ {
+ printf("\ndsyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+ double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+ // TODO ai and bi
+ int offsetC;
+ int offsetD;
+ offsetC = ci&(ps-1);
+ offsetD = di&(ps-1);
+
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
+ }
+ if(m>i)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
+// kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
+ }
+ if(j<i-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ j += 4;
+ }
+ kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+// kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ }
+ if(j<i-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ j += 4;
+ }
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ return;
+#endif
+
+ left_4_gen:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ return;
+
+ }
+
+
+
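+// dsyrk_ln_mn: same operation as dsyrk_ln above, but for an m x n (possibly
+// non-square) lower-trapezoidal result, so the column loops are additionally
+// bounded by n.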
+void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | bi!=0)
+ {
+		printf("\ndsyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+ double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+ // TODO ai and bi
+ int offsetC;
+ int offsetD;
+ offsetC = ci&(ps-1);
+ offsetD = di&(ps-1);
+
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-11)
+ {
+ kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-7)
+ {
+ kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+ j += 8;
+ }
+ if(j<i & j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4_gen:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas_64.h b/blas/d_blas_64.h
new file mode 100644
index 0000000..8e6aba2
--- /dev/null
+++ b/blas/d_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference BLAS and LAPACK routines employed in BLASFEO WR
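+// (Descriptive note: the long long arguments make these the 64-bit-integer
+// prototypes; d_lapack_lib.c includes this header when REF_BLAS_BLIS is
+// defined and otherwise falls back to d_blas.h, which is assumed to declare
+// the standard int-based interface. "WR" presumably refers to the wrapper,
+// i.e. LA=BLAS, build.)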
+
+// level 1
+void dcopy_(long long *m, double *x, long long *incx, double *y, long long *incy);
+void daxpy_(long long *m, double *alpha, double *x, long long *incx, double *y, long long *incy);
+void dscal_(long long *m, double *alpha, double *x, long long *incx);
+
+// level 2
+void dgemv_(char *ta, long long *m, long long *n, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dsymv_(char *uplo, long long *m, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dger_(long long *m, long long *n, double *alpha, double *x, long long *incx, double *y, long long *incy, double *A, long long *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, double *alpha, double *A, long long *lda, double *B, long long *ldb, double *beta, double *C, long long *ldc);
+void dsyrk_(char *uplo, char *trans, long long *n, long long *k, double *alpha, double *A, long long *lda, double *beta, double *C, long long *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+
+// lapack
+long long dpotrf_(char *uplo, long long *m, double *A, long long *lda, long long *info);
+long long dgetrf_(long long *m, long long *n, double *A, long long *lda, long long *ipiv, long long *info);
+void dgeqrf_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *lwork, long long *info);
+void dgeqr2_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/d_lapack_lib.c b/blas/d_lapack_lib.c
new file mode 100644
index 0000000..ce68c3d
--- /dev/null
+++ b/blas/d_lapack_lib.c
@@ -0,0 +1,75 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GELQF_LIBSTR dgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR dgelqf_work_size_libstr
+#define GEQRF_LIBSTR dgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR dgeqrf_work_size_libstr
+#define GETF2_NOPIVOT dgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR dgetrf_nopivot_libstr
+#define GETRF_LIBSTR dgetrf_libstr
+#define POTRF_L_LIBSTR dpotrf_l_libstr
+#define POTRF_L_MN_LIBSTR dpotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR dsyrk_dpotrf_ln_libstr
+
+#define COPY dcopy_
+#define GELQF dgelqf_
+#define GEMM dgemm_
+#define GER dger_
+#define GEQRF dgeqrf_
+#define GEQR2 dgeqr2_
+#define GETRF dgetrf_
+#define POTRF dpotrf_
+#define SCAL dscal_
+#define SYRK dsyrk_
+#define TRSM dtrsm_
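+
+// The defines above instantiate the precision-agnostic template: every generic
+// name used in x_lapack_lib.c (REAL, STRMAT, GETRF_LIBSTR, POTRF, ...) is
+// mapped to its double-precision counterpart before the template body is
+// textually included below; a single-precision build is expected to repeat the
+// pattern with REAL float, s_strmat, spotrf_ and so on.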
+
+
+#include "x_lapack_lib.c"
diff --git a/blas/d_lapack_lib4.c b/blas/d_lapack_lib4.c
new file mode 100644
index 0000000..75a4a4f
--- /dev/null
+++ b/blas/d_lapack_lib4.c
@@ -0,0 +1,2671 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+
+
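+// dgetrf without pivoting (descriptive note): factorizes the m x n panel-major
+// matrix in pC as L*U, storing both factors in pD and the reciprocals 1/U(j,j)
+// in inv_diag_D; each block row below runs a right-upper trsm sweep (building
+// L), the diagonal dgetrf kernel, and a left-lower unit trsm sweep (building
+// U).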
+void dgetrf_nn_nopivot_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int ii, jj, ie;
+
+ // main loop
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-11; ii+=12)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_l_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_m_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_m_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_r_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ if(m-ii<=4)
+ {
+ goto left_4;
+ }
+ else if(m-ii<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; ii<m-7; ii+=8)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_l_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+// kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+// kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ kernel_dgetrf_nn_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd],sdd, &pD[ii*ps+ii*sdd], sdd);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ if(m-ii<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ goto left_4;
+ }
+
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ return;
+
+#endif
+
+ left_4:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ }
+ return;
+
+ }
+
+
+
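+// dgetrf_nn_lib computes the LU factorization with partial (row) pivoting of the
+// m-by-n matrix C into D, storing the reciprocals of the diagonal of U in inv_diag_D
+// and the pivot indices in ipiv (zero-based: after the "+= i0" adjustment below,
+// ipiv[k] is the row that was swapped with row k). Each row swap is applied with
+// drowsw_lib in two pieces, the already-factorized columns to the left of the current
+// 4-column block and the not-yet-factorized columns to its right, using the
+// panel-major address pD + (i/ps)*ps*sdd + i%ps + j*ps for row i, column j.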
+void dgetrf_nn_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D, int *ipiv)
+ {
+
+ if(m<=0)
+ return;
+
+ const int ps = 4;
+
+ int ii, jj, i0, i1, j0, ll, p;
+
+ double d1 = 1.0;
+ double dm1 = -1.0;
+
+	// needs to perform row-exchanges on the yet-to-be-factorized matrix too
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+ // minimum matrix size
+ p = n<m ? n : m; // XXX
+
+ // main loop
+#if defined(TARGET_X64_INTEL_HASWELL)
+ // 12 columns at a time
+ jj = 0;
+ for(; jj<p-11; jj+=12)
+ {
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ ii += 4;
+ i1 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+ ii += 8;
+ i1 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+
+ // solve upper
+// i0 -= 8; // 4 ???
+ ll = jj+12;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+ }
+ if(ll<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 12, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ if(n-jj<=4)
+ goto left_n_4;
+ else if(n-jj<=8)
+ goto left_n_8;
+ else
+ goto left_n_12;
+ }
+ }
+ else // n>m
+ {
+ if(m-jj>0)
+ {
+ if(m-jj<=4)
+ goto left_m_4;
+ else if(m-jj<=8)
+ goto left_m_8;
+ else
+ goto left_m_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // 8 columns at a time
+ jj = 0;
+ for(; jj<p-7; jj+=8)
+ {
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else // SANDY_BRIDGE
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ ii += 4;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else // SANDY_BRIDGE
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+
+ // solve upper
+ i0 -= 4;
+ ll = jj+8;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+ }
+ if(ll<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 8, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+			if(n-jj<=4) // (m>=1 && n==1) || (m>=2 && n==2) || (m>=3 && n==3)
+ {
+ goto left_n_4;
+ }
+ else // (m>=5 && n==5) || (m>=6 && n==6) || (m>=7 && n==7)
+ goto left_n_8;
+ }
+ }
+ else // n>m
+ {
+ if(m-jj>0)
+ {
+ if(m-jj<=4) // (m==1 && n>=2) || (m==2 && n>=3) || (m==3 && n>=4) || (m==4 && n>=5)
+ goto left_m_4;
+ else // (m==5 && n>=6) || (m==6 && n>=7) || (m==7 && n>=8)
+ {
+ goto left_m_8;
+ }
+ }
+ }
+#else
+ // 4 columns at a time
+ jj = 0;
+ for(; jj<p-3; jj+=4) // XXX
+ {
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) // XXX
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd]);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd]);
+ }
+ if(n-ll>0)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], 4, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ goto left_n_4;
+ }
+ }
+ else
+ {
+ if(m-jj>0)
+ {
+ goto left_m_4;
+ }
+ }
+#endif
+
+ // common return if jj==n
+ return;
+
+
+ // clean up
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_n_12:
+ // 9-12 columns at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=8;
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=4;
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i1 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, n-jj-4);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+ ii += 8;
+ i1 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, n-jj-8);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ // there is no upper
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_m_12:
+ // 9-12 rows at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i1 = ii;
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+ ii += 8;
+ i1 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+// i0 -= 8;
+ ll = jj+12;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+ }
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_n_8:
+ // 5-8 columns at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-4; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=4;
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i0 = ii;
+ for( ; ii<m-4; ii+=8)
+ {
+		kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ // there is no upper
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_m_8:
+ // 5-8 rows at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i0 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ i0 -= 4;
+ ll = jj+8;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+ }
+ return;
+#endif
+
+
+ left_n_4:
+ // 1-4 columns at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+#if 0//defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX)
+ for( ; ii<m-4; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, 0, 0);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd], 0, 0);
+// ii+=4;
+ }
+#else
+ for( ; ii<m; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+ }
+#endif
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ if(n-jj>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ if(n-jj>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ if(n-jj>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ if(0) // there is no upper
+ {
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+ }
+ }
+ return;
+
+
+ left_m_4:
+ // 1-4 rows at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ if(m-i0>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ if(m-i0>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ if(m-i0>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+ }
+ return;
+
+ }
+
+
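+// The blocked dlauum+dpotrf routine below is excluded from the build by the
+// surrounding #if 0 / #endif.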
+#if 0
+void dlauum_dpotrf_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ // TODO remove
+ int k = cv[nv-1];
+
+ const int ps = 4;
+
+ int i, j, l;
+ int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+ i = 0;
+ ii = 0;
+ iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+
+ while(ii<nv && rv[ii]<i+8)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], 8, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], 8, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], 4, n-j-4); // TODO
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+
+ while(ii<nv && rv[ii]<i+4)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(i<j) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], m-i-4, n-j-4); // TODO
+ }
+ }
+ }
+ return;
+#endif
+
+ left_4:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+#endif
+
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
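+// The libstr wrappers below (compiled only under LA=HIGH_PERFORMANCE, see the guard
+// above) work on struct d_strmat: pA is the panel-major data, cn the panel stride
+// used as sdc/sdd, and dA the buffer that receives the reciprocals of the diagonal
+// of the factor (the role of inv_diag_D in the old interface); use_dA is set only
+// when the factor is written at offset (0,0). Non-zero row offsets ci/di are not
+// supported yet and make the routines exit with an error message.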
+// dpotrf
+void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndpotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA;
+
+ if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12: // 9 - 12
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dpotrf_nt_l_8x8_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, m-(j+4));
+ }
+ if(j<i-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, m-(j+4));
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ j += 4;
+ }
+ kernel_dpotrf_nt_l_8x8_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ }
+ if(j<i-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ j += 4;
+ }
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ return;
+#endif
+
+ }
+
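+// A minimal usage sketch for dpotrf_l_libstr, kept out of the build by #if 0. The
+// helpers d_allocate_strmat, d_cvt_mat2strmat, d_print_strmat and d_free_strmat are
+// assumed to be the ones declared in blasfeo_d_aux_ext_dep.h (check that header
+// before relying on the exact names); dpotrf_l_libstr itself is the routine above.
+#if 0
+#include "../include/blasfeo_d_aux_ext_dep.h"
+
+static int example_dpotrf_l()
+	{
+	int n = 8;
+	int i, j;
+	double A[8*8];
+	struct d_strmat sA, sL;
+	// build a simple symmetric positive definite matrix in column-major layout: 2*I + ones
+	for(j=0; j<n; j++)
+		for(i=0; i<n; i++)
+			A[i+n*j] = (i==j ? 2.0 : 0.0) + 1.0;
+	d_allocate_strmat(n, n, &sA);
+	d_allocate_strmat(n, n, &sL);
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0); // column-major -> panel-major
+	dpotrf_l_libstr(n, &sA, 0, 0, &sL, 0, 0); // sL <- lower Cholesky factor of sA
+	d_print_strmat(n, n, &sL, 0, 0);
+	d_free_strmat(&sA);
+	d_free_strmat(&sL);
+	return 0;
+	}
+#endif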
+
+
+// dpotrf, m x n variant
+void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndpotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA;
+
+ if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+				else // dpotrf
+ {
+					if(j<n-11)
+ {
+ kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-7)
+// if(0)
+ {
+ kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-3)
+ {
+ kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
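+// Example (disabled): minimal usage sketch for dpotrf_l_mn_libstr. The top n x n
+// block of sC must be symmetric positive definite; sD then receives its lower
+// Cholesky factor, with the rows below it right-solved against that factor.
+// The helpers d_allocate_strmat, d_cvt_mat2strmat and d_free_strmat are assumed
+// to be the EXT_DEP auxiliary routines; check names and signatures against the
+// installed blasfeo_d_aux*.h headers.
+#if 0
+#include "../include/blasfeo_d_aux.h" // assumed header paths
+#include "../include/blasfeo_d_aux_ext_dep.h"
+static void example_dpotrf_l_mn()
+ {
+ int m = 8, n = 4, ii, jj;
+ double C[8*4]; // column-major input panel
+ struct d_strmat sC, sD;
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ C[ii+m*jj] = ii==jj ? m+1.0 : 1.0; // diagonally dominant top block
+ d_allocate_strmat(m, n, &sC);
+ d_allocate_strmat(m, n, &sD);
+ d_cvt_mat2strmat(m, n, C, m, &sC, 0, 0);
+ dpotrf_l_mn_libstr(m, n, &sC, 0, 0, &sD, 0, 0);
+ d_free_strmat(&sC);
+ d_free_strmat(&sD);
+ }
+#endif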
+
+// dsyrk dpotrf
+void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndsyrk_dpotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-11)
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dsyrk_dpotrf_nt_l_8x8_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ if(j<n-8)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-7)
+// if(0)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(k, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, (j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+ }
+ if(j<i-3 & j<n-3)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[(j+4)*sdb], (j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+#endif
+
+ return;
+
+ }
+
+
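+// Example (disabled): dsyrk_dpotrf_ln_libstr fuses the two steps D = chol(C + A*B'),
+// lower factor only, without forming the intermediate product. Passing the same
+// matrix for A and B gives the common C + A*A' case, which is positive definite
+// here because C is the identity. The allocation/conversion helpers are assumed
+// EXT_DEP names, as in the sketch above.
+#if 0
+#include "../include/blasfeo_d_aux.h" // assumed header paths
+#include "../include/blasfeo_d_aux_ext_dep.h"
+static void example_dsyrk_dpotrf_ln()
+ {
+ int m = 8, n = 8, k = 4, ii, jj;
+ double A[8*4], C[8*8];
+ struct d_strmat sA, sC, sD;
+ for(jj=0; jj<k; jj++)
+ for(ii=0; ii<m; ii++)
+ A[ii+m*jj] = 1.0/(1.0+ii+jj);
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ C[ii+m*jj] = ii==jj ? 1.0 : 0.0;
+ d_allocate_strmat(m, k, &sA);
+ d_allocate_strmat(m, n, &sC);
+ d_allocate_strmat(m, n, &sD);
+ d_cvt_mat2strmat(m, k, A, m, &sA, 0, 0);
+ d_cvt_mat2strmat(m, n, C, m, &sC, 0, 0);
+ dsyrk_dpotrf_ln_libstr(m, n, k, &sA, 0, 0, &sA, 0, 0, &sC, 0, 0, &sD, 0, 0);
+ d_free_strmat(&sA);
+ d_free_strmat(&sC);
+ d_free_strmat(&sD);
+ }
+#endif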
+
+// dgetrf without pivoting
+void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndgetf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+ dgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+
+// dgetrf pivoting
+void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+ dgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
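+// Example (disabled): LU factorization with row pivoting via dgetrf_libstr. The
+// pivot array is assumed to need at least min(m,n) entries. The helper routines
+// (d_allocate_strmat, d_cvt_mat2strmat, d_free_strmat) are assumed EXT_DEP names.
+#if 0
+#include "../include/blasfeo_d_aux.h" // assumed header paths
+#include "../include/blasfeo_d_aux_ext_dep.h"
+static void example_dgetrf()
+ {
+ int n = 6, ii, jj;
+ double A[6*6];
+ int ipiv[6];
+ struct d_strmat sA, sLU;
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<n; ii++)
+ A[ii+n*jj] = ii==jj ? 2.0 : 1.0/(1.0+ii+jj); // diagonally dominant, nonsingular
+ d_allocate_strmat(n, n, &sA);
+ d_allocate_strmat(n, n, &sLU);
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ dgetrf_libstr(n, n, &sA, 0, 0, &sLU, 0, 0, ipiv); // sLU holds unit-lower L and U
+ d_free_strmat(&sA);
+ d_free_strmat(&sLU);
+ }
+#endif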
+
+int dgeqrf_work_size_libstr(int m, int n)
+ {
+ const int ps = 4;
+ int cm = (m+ps-1)/ps*ps;
+ int cn = (n+ps-1)/ps*ps;
+ return ps*(cm+cn)*sizeof(double);
+// return 0;
+ }
+
+
+
+void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *v_work)
+ {
+ char *work = (char *) v_work;
+ if(m<=0 | n<=0)
+ return;
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+ double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+ double *dD = sD->dA + di;
+ int cm = (m+ps-1)/ps*ps;
+ int cn = (n+ps-1)/ps*ps;
+ double *pVt = (double *) work;
+ work += ps*cm*sizeof(double);
+ double *pW = (double *) work;
+ work += ps*cn*sizeof(double);
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+ int ii;
+ int imax0 = (ps-(di&(ps-1)))&(ps-1);
+ int imax = m<n ? m : n;
+ imax0 = imax<imax0 ? imax : imax0;
+ if(imax0>0)
+ {
+ kernel_dgeqrf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+ pD += imax0-ps+ps*sdd+imax0*ps;
+ dD += imax0;
+ m -= imax0;
+ n -= imax0;
+ imax -= imax0;
+ }
+ for(ii=0; ii<imax-3; ii+=4)
+ {
+ kernel_dgeqrf_4_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, dD+ii);
+#if 0
+ kernel_dlarf_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd);
+#else
+ kernel_dgetr_4_0_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, pVt);
+ pVt[0+ps*0] = 1.0;
+ pVt[1+ps*0] = 0.0;
+ pVt[2+ps*0] = 0.0;
+ pVt[3+ps*0] = 0.0;
+ pVt[1+ps*1] = 1.0;
+ pVt[2+ps*1] = 0.0;
+ pVt[3+ps*1] = 0.0;
+ pVt[2+ps*2] = 1.0;
+ pVt[3+ps*2] = 0.0;
+ pVt[3+ps*3] = 1.0;
+ kernel_dlarf_t_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, pVt, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd, pW);
+#endif
+ }
+ if(ii<imax)
+ {
+ kernel_dgeqrf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+ return;
+ }
+
+
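+// Example (disabled): dgeqrf_libstr requires caller-provided scratch memory, sized
+// by dgeqrf_work_size_libstr above; plain malloc is used here only for illustration.
+// On return the output matrix holds the factorization in the usual compact
+// Householder form (R in the upper triangle, reflector vectors below the diagonal).
+// The allocation/conversion helpers are assumed EXT_DEP names as in the earlier sketches.
+#if 0
+#include <stdlib.h>
+#include "../include/blasfeo_d_aux.h" // assumed header paths
+#include "../include/blasfeo_d_aux_ext_dep.h"
+static void example_dgeqrf()
+ {
+ int m = 8, n = 4, ii, jj;
+ double A[8*4];
+ struct d_strmat sA, sQR;
+ void *work;
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ A[ii+m*jj] = ii==jj ? 2.0 : 1.0/(1.0+ii+jj); // full column rank
+ d_allocate_strmat(m, n, &sA);
+ d_allocate_strmat(m, n, &sQR);
+ d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);
+ work = malloc(dgeqrf_work_size_libstr(m, n));
+ dgeqrf_libstr(m, n, &sA, 0, 0, &sQR, 0, 0, work);
+ free(work);
+ d_free_strmat(&sA);
+ d_free_strmat(&sQR);
+ }
+#endif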
+
+int dgelqf_work_size_libstr(int m, int n)
+ {
+ return 0;
+ }
+
+
+
+void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+ double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+ double *dD = sD->dA + di;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ double pT[144] __attribute__ ((aligned (64))) = {0};
+ double pK[96] __attribute__ ((aligned (64))) = {0};
+#else
+ double pT[144] = {0};
+ double pK[96] = {0};
+#endif
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+ int ii, jj, ll;
+ int imax0 = (ps-(di&(ps-1)))&(ps-1);
+ int imax = m<n ? m : n;
+#if 0
+ kernel_dgelqf_vs_lib4(m, n, imax, di&(ps-1), pD, sdd, dD);
+#else
+ imax0 = imax<imax0 ? imax : imax0;
+ if(imax0>0)
+ {
+ kernel_dgelqf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+ pD += imax0-ps+ps*sdd+imax0*ps;
+ dD += imax0;
+ m -= imax0;
+ n -= imax0;
+ imax -= imax0;
+ }
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+// for(; ii<imax-11; ii+=12)
+ for(; ii<imax-127; ii+=12) // crossover point ~ ii=128
+ {
+ kernel_dgelqf_dlarft12_12_lib4(n-(ii+0), pD+(ii+0)*sdd+(ii+0)*ps, sdd, dD+(ii+0), &pT[0+0*12+0*ps]);
+ jj = ii+12;
+ for(; jj<m; jj+=4)
+ {
+ kernel_dlarfb12_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, pT, pD+jj*sdd+ii*ps, pK, m-jj);
+ }
+ }
+ for(; ii<imax-11; ii+=4)
+ {
+ kernel_dgelqf_dlarft4_12_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+ jj = ii+12;
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ // 8, 9, 10 or 11 rows/columns left to factorize
+ if(ii<imax-7)
+ {
+ kernel_dgelqf_dlarft4_8_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+ jj = ii+8;
+ if(jj<m)
+ {
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ ii += 4;
+ }
+ // 4, 5, 6 or 7 left to factorize
+ if(ii<imax-3)
+ {
+ kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ jj = ii+4;
+ if(jj<m)
+ {
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ ii += 4;
+ }
+ // 1, 2 or 3 left to factorize
+ if(ii<imax)
+ {
+ kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+#else // no haswell
+ for(ii=0; ii<imax-4; ii+=4)
+ {
+// kernel_dgelqf_vs_lib4(4, n-ii, 4, 0, pD+ii*sdd+ii*ps, sdd, dD+ii);
+// kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+// kernel_dlarft_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ jj = ii+4;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+#endif
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ if(ii<imax)
+ {
+ if(ii==imax-4)
+ {
+ kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+ }
+ else
+ {
+ kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+ }
+#endif // no haswell
+#endif
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas.h b/blas/s_blas.h
new file mode 100644
index 0000000..b6a92a7
--- /dev/null
+++ b/blas/s_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the BLAS and LAPACK routines employed in the BLASFEO WR (wrapper) version
+
+// level 1
+void scopy_(int *m, float *x, int *incx, float *y, int *incy);
+void saxpy_(int *m, float *alpha, float *x, int *incx, float *y, int *incy);
+void sscal_(int *m, float *alpha, float *x, int *incx);
+
+// level 2
+void sgemv_(char *ta, int *m, int *n, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void ssymv_(char *uplo, int *m, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void strmv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void strsv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *A, int *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc);
+void ssyrk_(char *uplo, char *trans, int *n, int *k, float *alpha, float *A, int *lda, float *beta, float *C, int *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+
+// lapack
+int spotrf_(char *uplo, int *m, float *A, int *lda, int *info);
+int sgetrf_(int *m, int *n, float *A, int *lda, int *ipiv, int *info);
+void sgeqrf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+void sgeqr2_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *info);
+void sgelqf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+
+
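+// Example (disabled): these prototypes follow the Fortran calling convention, so
+// every argument is passed by pointer and matrices are column-major with an
+// explicit leading dimension. A plain C = beta*C + alpha*A*B call looks like this.
+#if 0
+static void example_sgemm_call()
+ {
+ int m = 4, n = 4, k = 4;
+ float alpha = 1.0, beta = 0.0;
+ float A[16] = {0}, B[16] = {0}, C[16]; // column-major 4x4 blocks, leading dimension 4
+ sgemm_("N", "N", &m, &n, &k, &alpha, A, &m, B, &m, &beta, C, &m);
+ }
+#endif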
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/s_blas1_lib.c b/blas/s_blas1_lib.c
new file mode 100644
index 0000000..67fec77
--- /dev/null
+++ b/blas/s_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define AXPY_LIBSTR saxpy_libstr
+#define VECMULDOT_LIBSTR svecmuldot_libstr
+#define DOT_LIBSTR sdot_libstr
+
+#define AXPY saxpy_
+#define COPY scopy_
+
+
+#include "x_blas1_lib.c"
+
diff --git a/blas/s_blas1_lib4.c b/blas/s_blas1_lib4.c
new file mode 100644
index 0000000..8588020
--- /dev/null
+++ b/blas/s_blas1_lib4.c
@@ -0,0 +1,123 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
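+// Example (disabled): z = y + alpha*x on packed s_strvec vectors. The helpers
+// s_allocate_strvec, s_cvt_vec2strvec and s_free_strvec are assumed to be the
+// EXT_DEP auxiliary routines; check names against the installed headers.
+#if 0
+#include "../include/blasfeo_s_aux.h" // assumed header paths
+#include "../include/blasfeo_s_aux_ext_dep.h"
+static void example_saxpy()
+ {
+ int n = 5, ii;
+ float x[5], y[5];
+ struct s_strvec sx, sy, sz;
+ for(ii=0; ii<n; ii++)
+ {
+ x[ii] = 1.0;
+ y[ii] = ii;
+ }
+ s_allocate_strvec(n, &sx);
+ s_allocate_strvec(n, &sy);
+ s_allocate_strvec(n, &sz);
+ s_cvt_vec2strvec(n, x, &sx, 0);
+ s_cvt_vec2strvec(n, y, &sy, 0);
+ saxpy_libstr(n, 2.0, &sx, 0, &sy, 0, &sz, 0); // sz = sy + 2*sx
+ s_free_strvec(&sx);
+ s_free_strvec(&sy);
+ s_free_strvec(&sz);
+ }
+#endif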
+
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1];
+ y[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2];
+ y[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3];
+ y[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ float dot = 0.0;
+
+ ii = 0;
+
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/blas/s_blas1_lib8.c b/blas/s_blas1_lib8.c
new file mode 100644
index 0000000..538c012
--- /dev/null
+++ b/blas/s_blas1_lib8.c
@@ -0,0 +1,124 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1];
+ y[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2];
+ y[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3];
+ y[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ float dot = 0.0;
+
+ ii = 0;
+
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/s_blas2_diag_lib.c b/blas/s_blas2_diag_lib.c
new file mode 100644
index 0000000..1dde42f
--- /dev/null
+++ b/blas/s_blas2_diag_lib.c
@@ -0,0 +1,46 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define GEMV_DIAG_LIBSTR sgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
+
diff --git a/blas/s_blas2_lib.c b/blas/s_blas2_lib.c
new file mode 100644
index 0000000..7ab8dc2
--- /dev/null
+++ b/blas/s_blas2_lib.c
@@ -0,0 +1,72 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMV_N_LIBSTR sgemv_n_libstr
+#define GEMV_NT_LIBSTR sgemv_nt_libstr
+#define GEMV_T_LIBSTR sgemv_t_libstr
+#define SYMV_L_LIBSTR ssymv_l_libstr
+#define TRMV_LNN_LIBSTR strmv_lnn_libstr
+#define TRMV_LTN_LIBSTR strmv_ltn_libstr
+#define TRMV_UNN_LIBSTR strmv_unn_libstr
+#define TRMV_UTN_LIBSTR strmv_utn_libstr
+#define TRSV_LNN_LIBSTR strsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR strsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR strsv_lnu_libstr
+#define TRSV_LTN_LIBSTR strsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR strsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR strsv_ltu_libstr
+#define TRSV_UNN_LIBSTR strsv_unn_libstr
+#define TRSV_UTN_LIBSTR strsv_utn_libstr
+
+#define COPY scopy_
+#define GEMV sgemv_
+#define SYMV ssymv_
+#define TRMV strmv_
+#define TRSV strsv_
+
+
+
+#include "x_blas2_lib.c"
+
diff --git a/blas/s_blas2_lib4.c b/blas/s_blas2_lib4.c
new file mode 100644
index 0000000..b7a947d
--- /dev/null
+++ b/blas/s_blas2_lib4.c
@@ -0,0 +1,1045 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_sgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 4 - ai%bs;
+ z += 4 - ai%bs;
+ m -= 4 - ai%bs;
+ }
+ // main loop
+ for( ; i<m-3; i+=4)
+ {
+ kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
+
+
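+// Example (disabled): z = beta*y + alpha*A*x with A stored as an m x n s_strmat.
+// The setup helpers (s_allocate_strmat, s_cvt_mat2strmat, s_allocate_strvec,
+// s_cvt_vec2strvec and the matching free routines) are assumed EXT_DEP names.
+#if 0
+#include "../include/blasfeo_s_aux_ext_dep.h" // assumed header path
+static void example_sgemv_n()
+ {
+ int m = 6, n = 3, ii, jj;
+ float A[6*3], x[3], y[6];
+ struct s_strmat sA;
+ struct s_strvec sx, sy, sz;
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ A[ii+m*jj] = 1.0+ii+m*jj;
+ for(jj=0; jj<n; jj++)
+ x[jj] = 1.0;
+ for(ii=0; ii<m; ii++)
+ y[ii] = 0.0;
+ s_allocate_strmat(m, n, &sA);
+ s_allocate_strvec(n, &sx);
+ s_allocate_strvec(m, &sy);
+ s_allocate_strvec(m, &sz);
+ s_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);
+ s_cvt_vec2strvec(n, x, &sx, 0);
+ s_cvt_vec2strvec(m, y, &sy, 0);
+ sgemv_n_libstr(m, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0); // sz = A*x
+ s_free_strmat(&sA);
+ s_free_strvec(&sx);
+ s_free_strvec(&sy);
+ s_free_strvec(&sz);
+ }
+#endif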
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_sgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_sgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ else // TODO kernel 8
+ {
+ i = 0;
+ for( ; i<n; i+=4)
+ {
+ kernel_sgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+ {
+
+ if(ai!=0)
+ {
+ printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x_n = sx_n->pa + xi_n;
+ float *x_t = sx_t->pa + xi_t;
+ float *y_n = sy_n->pa + yi_n;
+ float *y_t = sy_t->pa + yi_t;
+ float *z_n = sz_n->pa + zi_n;
+ float *z_t = sz_t->pa + zi_t;
+
+// sgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+
+// if(m<=0 | n<=0)
+// return;
+
+ int ii;
+
+ // copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_sgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_sgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+ }
+
+
+
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, n1;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ // copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+ if(ai%bs!=0) // 1, 2, 3
+ {
+ n1 = 4-ai%bs;
+ kernel_ssymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_ssymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_ssymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+ }
+
+ return;
+ }
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ if(m-n>0)
+ sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ float *pA2 = pA;
+ float *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ float *pA3, *x3;
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ float zt[4];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%4!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-3; ii+=4)
+ {
+ pA2 -= bs*sda;
+ z2 -= 4;
+ n2 -= 4;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%4!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==3)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ float xt[4];
+ float zt[4];
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 1;
+ x += 3;
+ kernel_sgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 2;
+ x += 2;
+ kernel_sgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==3)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 3;
+ x += 1;
+ kernel_sgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-3; jj+=4)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+ zt[3] = pA[3+bs*3]*x[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*4;
+ z += 4;
+ }
+ if(jj<n)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+
+ return;
+
+ }
+
+
+
+void strmv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int i;
+
+ i=0;
+ for(; i<m-3; i+=4)
+ {
+ kernel_strmv_un_4_lib4(m-i, pA, x, z);
+ pA += 4*sda+4*bs;
+ x += 4;
+ z += 4;
+ }
+ if(m>i)
+ {
+ if(m-i==1)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ else if(m-i==2)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+ z[1] = pA[1+bs*1]*x[1];
+ }
+ else // if(m-i==3)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+ z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+ z[2] = pA[2+bs*2]*x[2];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void strmv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\nstrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii, idx;
+
+ float *ptrA;
+
+ ii=0;
+ idx = m/bs*bs;
+ if(m%bs!=0)
+ {
+ kernel_strmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+ ii += m%bs;
+ }
+ idx -= 4;
+ for(; ii<m; ii+=4)
+ {
+ kernel_strmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+ idx -= 4;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<m-3; i+=4)
+ {
+ kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+ i+=4;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(m<n)
+ m = n;
+
+ float alpha = -1.0;
+ float beta = 1.0;
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+ i+=4;
+ }
+ for( ; i<m-3; i+=4)
+ {
+ kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+ i+=4;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ if(m%4==1)
+ {
+ kernel_strsv_lt_inv_1_lib4(i+1, &pA[m/bs*bs*sda+(m-i-1)*bs], sda, &dA[m-i-1], &z[m-i-1], &z[m-i-1], &z[m-i-1]);
+ i++;
+ }
+ else if(m%4==2)
+ {
+ kernel_strsv_lt_inv_2_lib4(i+2, &pA[m/bs*bs*sda+(m-i-2)*bs], sda, &dA[m-i-2], &z[m-i-2], &z[m-i-2], &z[m-i-2]);
+ i+=2;
+ }
+ else if(m%4==3)
+ {
+ kernel_strsv_lt_inv_3_lib4(i+3, &pA[m/bs*bs*sda+(m-i-3)*bs], sda, &dA[m-i-3], &z[m-i-3], &z[m-i-3], &z[m-i-3]);
+ i+=3;
+ }
+ for(; i<m-3; i+=4)
+ {
+ kernel_strsv_lt_inv_4_lib4(i+4, &pA[(m-i-4)/bs*bs*sda+(m-i-4)*bs], sda, &dA[m-i-4], &z[m-i-4], &z[m-i-4], &z[m-i-4]);
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(n>m)
+ n = m;
+
+ int i;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ if(n%4==1)
+ {
+ kernel_strsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &dA[n-i-1], &z[n-i-1], &z[n-i-1], &z[n-i-1]);
+ i++;
+ }
+ else if(n%4==2)
+ {
+ kernel_strsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &dA[n-i-2], &z[n-i-2], &z[n-i-2], &z[n-i-2]);
+ i+=2;
+ }
+ else if(n%4==3)
+ {
+ kernel_strsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &dA[n-i-3], &z[n-i-3], &z[n-i-3], &z[n-i-3]);
+ i+=3;
+ }
+ for(; i<n-3; i+=4)
+ {
+ kernel_strsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &dA[n-i-4], &z[n-i-4], &z[n-i-4], &z[n-i-4]);
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_ltu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
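
For orientation, a minimal usage sketch of the triangular-solve routines added in this file (editorial example, not part of the patch). The allocation and conversion helpers s_allocate_strmat, s_allocate_strvec, s_cvt_mat2strmat, s_cvt_vec2strvec, s_free_strmat, s_free_strvec and the header names below are assumed from the EXT_DEP part of the BLASFEO API of this vintage; adjust to the actual headers of the build if they differ.

/* Solve A*z = x with the lower-triangular, non-transposed, non-unit-diagonal
   solver strsv_lnn_libstr; helper and header names are assumptions, see above. */
#include <stdio.h>
#include "blasfeo_target.h"
#include "blasfeo_common.h"
#include "blasfeo_s_aux_ext_dep.h"
#include "blasfeo_s_blas.h"

int main()
	{
	int ii;
	int n = 4;
	// column-major lower-triangular matrix and right-hand side
	float A[16] = {4, 1, 2, 3,  0, 3, 1, 2,  0, 0, 2, 1,  0, 0, 0, 1};
	float x[4] = {1, 1, 1, 1};
	struct s_strmat sA;
	struct s_strvec sx, sz;
	s_allocate_strmat(n, n, &sA);
	s_allocate_strvec(n, &sx);
	s_allocate_strvec(n, &sz);
	s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0); // pack A into panel-major storage
	s_cvt_vec2strvec(n, x, &sx, 0);
	strsv_lnn_libstr(n, &sA, 0, 0, &sx, 0, &sz, 0); // z = A \ x
	for(ii=0; ii<n; ii++)
		printf("%f\n", sz.pa[ii]);
	s_free_strmat(&sA);
	s_free_strvec(&sx);
	s_free_strvec(&sz);
	return 0;
	}
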
diff --git a/blas/s_blas2_lib8.c b/blas/s_blas2_lib8.c
new file mode 100644
index 0000000..41a78c4
--- /dev/null
+++ b/blas/s_blas2_lib8.c
@@ -0,0 +1,1008 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 8;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_sgemv_n_8_gen_lib8(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 8 - ai%bs;
+ z += 8 - ai%bs;
+ m -= 8 - ai%bs;
+ }
+ // main loop
+ for( ; i<m-7; i+=8)
+ {
+ kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 8;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+ for( ; i<n-7; i+=8)
+ {
+ kernel_sgemv_t_8_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n)
+ {
+ if(n-i<=4)
+ {
+ kernel_sgemv_t_4_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ else
+ {
+ kernel_sgemv_t_8_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ }
+ else
+ {
+ i = 0;
+ for( ; i<n-4; i+=8)
+ {
+ kernel_sgemv_t_8_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ if(i<n)
+ {
+ kernel_sgemv_t_4_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ if(m-n>0)
+ sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ float *pA2 = pA;
+ float *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ float *pA3, *x3;
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ float zt[8];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%bs!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+ zt[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+ zt[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+ zt[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-7; ii+=8)
+ {
+ pA2 -= bs*sda;
+ z2 -= 8;
+ n2 -= 8;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+ z2[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+ z2[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+ z2[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%bs!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[6] = pA[6+bs*0]*x[0] + pA[6+bs*1]*x[1] + pA[6+bs*2]*x[2] + pA[6+bs*3]*x[3] + pA[6+bs*4]*x[4] + pA[6+bs*5]*x[5] + pA[6+bs*6]*x[6];
+ zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==3)
+ {
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==4)
+ {
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==5)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==6)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==7)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ float xt[8];
+ float zt[8];
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<7 ? m-jj : 7;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<7; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6];
+ zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6];
+ zt[6] = pA[6+bs*6]*xt[6];
+ pA += bs*sda - 1;
+ x += 7;
+ kernel_sgemv_t_8_lib8(m-7-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<7 ? n-jj : 7;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*7;
+ z += 7;
+ jj += 7;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<6 ? m-jj : 6;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<6; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5];
+ zt[5] = pA[5+bs*5]*xt[5];
+ pA += bs*sda - 2;
+ x += 6;
+ kernel_sgemv_t_8_lib8(m-6-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<6 ? n-jj : 6;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*6;
+ z += 6;
+ jj += 6;
+ }
+ else if(ai%bs==3)
+ {
+ ll_max = m-jj<5 ? m-jj : 5;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<5; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4];
+ zt[4] = pA[4+bs*4]*xt[4];
+ pA += bs*sda - 3;
+ x += 5;
+ kernel_sgemv_t_8_lib8(m-5-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<5 ? n-jj : 5;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*5;
+ z += 5;
+ jj += 5;
+ }
+ else if(ai%bs==4)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda - 4;
+ x += 4;
+ kernel_sgemv_t_8_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<4 ? n-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*4;
+ z += 4;
+ jj += 4;
+ }
+ else if(ai%bs==5)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 5;
+ x += 3;
+ kernel_sgemv_t_8_lib8(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==6)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 6;
+ x += 2;
+ kernel_sgemv_t_8_lib8(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==7)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 7;
+ x += 1;
+ kernel_sgemv_t_8_lib8(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-7; jj+=8)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3] + pA[4+bs*0]*x[4] + pA[5+bs*0]*x[5] + pA[6+bs*0]*x[6] + pA[7+bs*0]*x[7];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3] + pA[4+bs*1]*x[4] + pA[5+bs*1]*x[5] + pA[6+bs*1]*x[6] + pA[7+bs*1]*x[7];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3] + pA[4+bs*2]*x[4] + pA[5+bs*2]*x[5] + pA[6+bs*2]*x[6] + pA[7+bs*2]*x[7];
+ zt[3] = pA[3+bs*3]*x[3] + pA[4+bs*3]*x[4] + pA[5+bs*3]*x[5] + pA[6+bs*3]*x[6] + pA[7+bs*3]*x[7];
+ zt[4] = pA[4+bs*4]*x[4] + pA[5+bs*4]*x[5] + pA[6+bs*4]*x[6] + pA[7+bs*4]*x[7];
+ zt[5] = pA[5+bs*5]*x[5] + pA[6+bs*5]*x[6] + pA[7+bs*5]*x[7];
+ zt[6] = pA[6+bs*6]*x[6] + pA[7+bs*6]*x[7];
+ zt[7] = pA[7+bs*7]*x[7];
+ pA += bs*sda;
+ x += 8;
+ kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*8;
+ z += 8;
+ }
+ if(jj<n)
+ {
+ if(n-jj<=4)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+ else
+ {
+ ll_max = m-jj<8 ? m-jj : 8;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<8; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6] + pA[7+bs*0]*xt[7];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6] + pA[7+bs*1]*xt[7];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6] + pA[7+bs*2]*xt[7];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6] + pA[7+bs*3]*xt[7];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6] + pA[7+bs*4]*xt[7];
+ zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6] + pA[7+bs*5]*xt[7];
+ zt[6] = pA[6+bs*6]*xt[6] + pA[7+bs*6]*xt[7];
+ zt[7] = pA[7+bs*7]*xt[7];
+ pA += bs*sda;
+ x += 8;
+ kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*8;
+// z += 8;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<m-7; i+=8)
+ {
+ kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+ i+=8;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(m<n)
+ m = n;
+
+ float alpha = -1.0;
+ float beta = 1.0;
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-7; i+=8)
+ {
+ kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+ i+=8;
+ }
+ for( ; i<m-7; i+=8)
+ {
+ kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+ i+=8;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i, i1;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ i1 = m%8;
+ if(i1!=0)
+ {
+ kernel_strsv_lt_inv_8_vs_lib8(i+i1, &pA[m/bs*bs*sda+(m-i-i1)*bs], sda, &dA[m-i-i1], &z[m-i-i1], &z[m-i-i1], &z[m-i-i1], i1, i1);
+ i += i1;
+ }
+ for(; i<m-7; i+=8)
+ {
+ kernel_strsv_lt_inv_8_lib8(i+8, &pA[(m-i-8)/bs*bs*sda+(m-i-8)*bs], sda, &dA[m-i-8], &z[m-i-8], &z[m-i-8], &z[m-i-8]);
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(n>m)
+ n = m;
+
+ int i, i1;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ i1 = n%8;
+ if(i1!=0)
+ {
+ kernel_strsv_lt_inv_8_vs_lib8(m-n+i1, &pA[n/bs*bs*sda+(n-i1)*bs], sda, &dA[n-i1], &z[n-i1], &z[n-i1], &z[n-i1], m-n+i1, i1);
+ i += i1;
+ }
+ for(; i<n-7; i+=8)
+ {
+ kernel_strsv_lt_inv_8_lib8(m-n+i+8, &pA[(n-i-8)/bs*bs*sda+(n-i-8)*bs], sda, &dA[n-i-8], &z[n-i-8], &z[n-i-8], &z[n-i-8]);
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+ {
+
+ if(ai!=0)
+ {
+ printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x_n = sx_n->pa + xi_n;
+ float *x_t = sx_t->pa + xi_t;
+ float *y_n = sy_n->pa + yi_n;
+ float *y_t = sy_t->pa + yi_t;
+ float *z_n = sz_n->pa + zi_n;
+ float *z_t = sz_t->pa + zi_t;
+
+// if(m<=0 | n<=0)
+// return;
+
+ int ii;
+
+	// copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_sgemv_nt_4_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_sgemv_nt_4_vs_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+ }
+
+
+
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+// if(m<=0 | n<=0)
+// return;
+
+ const int bs = 8;
+
+ int ii, n1, n2;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+	// copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+	if(ai%bs!=0) // ai%bs = 1, ..., 7
+ {
+ n1 = 8-ai%bs;
+ n2 = n<n1 ? n : n1;
+ kernel_ssymv_l_4l_gen_lib8(m-0, &alpha, ai%bs, &pA[0+(0)*bs], sda, &x[0], &z[0], n2-0);
+ kernel_ssymv_l_4r_gen_lib8(m-4, &alpha, ai%bs, &pA[4+(4)*bs], sda, &x[4], &z[4], n2-4);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-7; ii+=8)
+ {
+ kernel_ssymv_l_4l_lib8(m-ii-0, &alpha, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0]);
+ kernel_ssymv_l_4r_lib8(m-ii-4, &alpha, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_ssymv_l_4l_gen_lib8(m-ii-0, &alpha, 0, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0], n-ii-0);
+ kernel_ssymv_l_4r_gen_lib8(m-ii-4, &alpha, 0, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4], n-ii-4);
+ }
+
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
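
As a plain-C reference for the semantics of the fused sgemv_nt_libstr defined above (editorial sketch, not part of the patch; all offsets are taken as zero and A is stored plain column-major): the routine computes z_n = beta_n*y_n + alpha_n*A*x_n and z_t = beta_t*y_t + alpha_t*A'*x_t in a single sweep of A.

//   z_n = beta_n*y_n + alpha_n * A   * x_n    (A is m x n)
//   z_t = beta_t*y_t + alpha_t * A^T * x_t
void ref_sgemv_nt(int m, int n, float alpha_n, float alpha_t,
		const float *A, int lda, const float *x_n, const float *x_t,
		float beta_n, float beta_t, const float *y_n, const float *y_t,
		float *z_n, float *z_t)
	{
	int ii, jj;
	float tmp;
	for(ii=0; ii<m; ii++)
		z_n[ii] = beta_n*y_n[ii];
	for(jj=0; jj<n; jj++)
		{
		tmp = 0.0f;
		for(ii=0; ii<m; ii++)
			{
			z_n[ii] += alpha_n * A[ii+lda*jj] * x_n[jj]; // "n" product, column jj
			tmp += A[ii+lda*jj] * x_t[ii];               // "t" product, same column
			}
		z_t[jj] = beta_t*y_t[jj] + alpha_t*tmp;
		}
	}
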
diff --git a/blas/s_blas3_diag_lib.c b/blas/s_blas3_diag_lib.c
new file mode 100644
index 0000000..23f8e0f
--- /dev/null
+++ b/blas/s_blas3_diag_lib.c
@@ -0,0 +1,49 @@
+
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMM_L_DIAG_LIBSTR sgemm_l_diag_libstr
+#define GEMM_R_DIAG_LIBSTR sgemm_r_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
+
diff --git a/blas/s_blas3_diag_lib4.c b/blas/s_blas3_diag_lib4.c
new file mode 100644
index 0000000..0319212
--- /dev/null
+++ b/blas/s_blas3_diag_lib4.c
@@ -0,0 +1,161 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with A diagonal matrix (stored as strvec)
+void sgemm_l_diag_libstr(int m, int n, float alpha, struct s_strvec *sA, int ai, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *dA = sA->pa + ai;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// sgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+ }
+ }
+ else
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii==1)
+ kernel_sgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else if(m-ii==2)
+ kernel_sgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else // if(m-ii==3)
+ kernel_sgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+
+ return;
+
+ }
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *dB = sB->pa + bi;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
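
As a plain-C reference for the semantics of the diagonal gemm routines defined above (editorial sketch, not part of the patch; offsets taken as zero, plain column-major storage): sgemm_l_diag computes D = beta*C + alpha*diag(a)*B and sgemm_r_diag computes D = beta*C + alpha*A*diag(b); the right variant is spelled out below.

// D = beta*C + alpha * A * diag(b), with A, C, D of size m x n and b of length n
void ref_sgemm_r_diag(int m, int n, float alpha, const float *A, int lda,
		const float *b, float beta, const float *C, int ldc,
		float *D, int ldd)
	{
	int ii, jj;
	for(jj=0; jj<n; jj++)
		for(ii=0; ii<m; ii++)
			D[ii+ldd*jj] = beta*C[ii+ldc*jj] + alpha*A[ii+lda*jj]*b[jj];
	}
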
diff --git a/blas/s_blas3_diag_lib8.c b/blas/s_blas3_diag_lib8.c
new file mode 100644
index 0000000..8469345
--- /dev/null
+++ b/blas/s_blas3_diag_lib8.c
@@ -0,0 +1,105 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *dB = sB->pa + bi;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_lib.c b/blas/s_blas3_lib.c
new file mode 100644
index 0000000..dca98ff
--- /dev/null
+++ b/blas/s_blas3_lib.c
@@ -0,0 +1,70 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+
+#define GEMM_NN_LIBSTR sgemm_nn_libstr
+#define GEMM_NT_LIBSTR sgemm_nt_libstr
+#define SYRK_LN_LIBSTR ssyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR ssyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR strmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR strmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR strsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR strsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR strsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR strsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR strsm_rutn_libstr
+
+#define COPY scopy_
+#define GEMM sgemm_
+#define SYRK ssyrk_
+#define TRMM strmm_
+#define TRSM strsm_
+
+
+
+#include "x_blas3_lib.c"
+
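
The wrapper above shows the pattern used by the s_*_lib.c files: the generic implementation lives in x_blas3_lib.c and is instantiated by defining the type and symbol-name macros before including it. A self-contained illustration of the same pattern (editorial sketch; REAL, AXPY_SIMPLE and saxpy_simple are hypothetical names used for illustration, not BLASFEO symbols):

#include <stdio.h>

#define REAL float
#define AXPY_SIMPLE saxpy_simple

// what the shared generic source would contain, written only in terms of macros
void AXPY_SIMPLE(int n, REAL alpha, const REAL *x, REAL *y)
	{
	int ii;
	for(ii=0; ii<n; ii++)
		y[ii] += alpha*x[ii];
	}

int main()
	{
	float x[3] = {1.0f, 2.0f, 3.0f}, y[3] = {0.0f, 0.0f, 0.0f};
	saxpy_simple(3, 2.0f, x, y); // the macro above expanded to this symbol
	printf("%f %f %f\n", y[0], y[1], y[2]);
	return 0;
	}
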
diff --git a/blas/s_blas3_lib4.c b/blas/s_blas3_lib4.c
new file mode 100644
index 0000000..c6be38f
--- /dev/null
+++ b/blas/s_blas3_lib4.c
@@ -0,0 +1,1062 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void sgemm_nt_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_16x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+12)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+12)*sdc], &pD[j*bs+(i+12)*sdd], m-(i+12), n-j);
+ }
+ }
+#endif
+#if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ }
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_8x8_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+#endif
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ }
+ }
+#endif
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ }
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void sgemm_nn_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void strmm_nt_ru_lib(int m, int n, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+
+ // common return
+ return;
+
+ left_4:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], alg, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+// }
+ return;
+
+ }
+
+
+
+// D <= B * A^{-T} , with A lower triangular with unit diagonal
+void strsm_nt_rl_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void strsm_nt_ru_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, idx;
+
+ int rn = n%4;
+
+	float *dummy = NULL; // never dereferenced: only passed to kernels called with k==0
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_strsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A lower triangular with unit diagonal
+void strsm_nn_ll_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+
+ for( ; i<m-3; i+=4)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+
+ // common return
+ return;
+
+ left_4:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void strsm_nn_lu_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, idx;
+ float *dummy;
+
+ i = 0;
+ int rm = m%4;
+ if(rm>0)
+ {
+	// TODO code explicitly the final case
+ idx = m-rm; // position of the part to do
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*bs, pD+idx*sdd+j*bs, pA+idx*sda+idx*bs, inv_diag_A+idx, rm, n-j);
+ }
+ // TODO
+ i += rm;
+ }
+// int em = m-rm;
+ for( ; i<m; i+=4)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_strsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4));
+ }
+ if(j<n)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4), 4, n-j);
+ }
+ }
+
+ // common return
+ return;
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
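+// note: the *_libstr wrappers below take panel-major s_strmat arguments, resolve the row/column offsets, and dispatch to the raw *_lib routines above or directly to the 4x4 kernels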
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm nt
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
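+	// fully aligned case (all row offsets zero) is forwarded to sgemm_nt_lib; otherwise the _gen_ kernels below handle arbitrary row offsets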
+ if(ai==0 & bi==0 & ci==0 & di==0)
+ {
+ sgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+ pA += ai/bs*bs*sda;
+	pB += bi/bs*bs*sdb;
+ int ci0 = ci-ai%bs;
+ int di0 = di-ai%bs;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/bs*bs*sdc;
+ offsetC = ci0%bs;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = bs+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = bs+di0;
+ }
+
+ int i, j, l;
+
+ int idxB;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bi%bs!=0)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, ai%bs, m-i, bi%bs, n-j);
+ j += bs-bi%bs;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, ai%bs, m-i, 0, n-j);
+ idxB += 4;
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pC += bs*sdc;
+ pD += bs*sdd;
+ }
+ // main loop
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bi%bs!=0)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, 0, m-i, bi%bs, n-j);
+ j += bs-bi%bs;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// dgemm nn
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(m<=0 || n<=0)
+ return;
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_nn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ sgemm_nn_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_nn_llu
+void strsm_llnu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_nn_lun
+void strsm_lunn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ strsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ // TODO alpha
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+
+ int i, j;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+ if(m<=0 || n<=0)
+ return;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void strsm_rltu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void strsm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ strsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0)
+ {
+ printf("\nstrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ pA += ai/bs*bs*sda;
+ pB += bi/bs*bs*sdb;
+ int offsetB = bi%bs;
+ int di0 = di-ai%bs;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = bs+di0;
+ }
+
+ int ii, jj;
+
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+ for(; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs]);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, 0, &pD[ii*sdd+jj*bs], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4;
+ }
+ }
+ else
+ {
+ for(; ii<m; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+
+ }
+
+
+
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+		printf("\nssyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+ int i, j, l;
+
+ i = 0;
+
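+	// lower-triangular syrk: strictly-lower 4x4 blocks of D use the gemm kernel, diagonal blocks use the syrk_l kernel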
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+ }
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+ return;
+
+ }
+
+
+
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+		printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dgemm
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ else
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_lib8.c b/blas/s_blas3_lib8.c
new file mode 100644
index 0000000..f0f5144
--- /dev/null
+++ b/blas/s_blas3_lib8.c
@@ -0,0 +1,1325 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#if defined(DIM_CHECK)
+#include <stdio.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+	// TODO check that sA!=sD, or that if sA==sD then they do not overlap (same for sB)
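+	// note: these checks only print a warning, they do not abort execution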
+ // non-negative size
+ if(m<0) printf("\n****** sgemm_nt_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgemm_nt_libstr : n<0 : %d<0 *****\n", n);
+ if(k<0) printf("\n****** sgemm_nt_libstr : k<0 : %d<0 *****\n", k);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgemm_nt_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgemm_nt_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgemm_nt_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgemm_nt_libstr : bj<0 : %d<0 *****\n", bj);
+ if(ci<0) printf("\n****** sgemm_nt_libstr : ci<0 : %d<0 *****\n", ci);
+ if(cj<0) printf("\n****** sgemm_nt_libstr : cj<0 : %d<0 *****\n", cj);
+ if(di<0) printf("\n****** sgemm_nt_libstr : di<0 : %d<0 *****\n", di);
+ if(dj<0) printf("\n****** sgemm_nt_libstr : dj<0 : %d<0 *****\n", dj);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** sgemm_nt_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+k > sA->n) printf("\n***** sgemm_nt_libstr : aj+k > col(A) : %d+%d > %d *****\n", aj, k, sA->n);
+ // B: n x k
+ if(bi+n > sB->m) printf("\n***** sgemm_nt_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+ if(bj+k > sB->n) printf("\n***** sgemm_nt_libstr : bj+k > col(B) : %d+%d > %d *****\n", bj, k, sB->n);
+ // C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nt_libstr : ci+m > row(C) : %d+%d > %d *****\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nt_libstr : cj+n > col(C) : %d+%d > %d *****\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nt_libstr : di+m > row(D) : %d+%d > %d *****\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nt_libstr : dj+n > col(D) : %d+%d > %d *****\n", dj, n, sD->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int i, j, l;
+
+ i = 0;
+
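+	// main loop: 24 rows of A per iteration on X64_INTEL_HASWELL, 16 otherwise; remainder rows jump to the left_* clean-up labels below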
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+ }
+ }
+ }
+ if(m-i>0)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+// else if(m-i<=20)
+// {
+// goto left_20;
+// }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+ }
+ }
+ }
+ if(m-i>0)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_24:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_20:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_12:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ return;
+#endif
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_4:
+ j = 0;
+ for(; j<n; j+=8)
+ {
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
+
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+	if(m<0) printf("\n****** sgemm_nn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgemm_nn_libstr : n<0 : %d<0 *****\n", n);
+	if(k<0) printf("\n****** sgemm_nn_libstr : k<0 : %d<0 *****\n", k);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgemm_nn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgemm_nn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgemm_nn_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgemm_nn_libstr : bj<0 : %d<0 *****\n", bj);
+	if(ci<0) printf("\n****** sgemm_nn_libstr : ci<0 : %d<0 *****\n", ci);
+	if(cj<0) printf("\n****** sgemm_nn_libstr : cj<0 : %d<0 *****\n", cj);
+	if(di<0) printf("\n****** sgemm_nn_libstr : di<0 : %d<0 *****\n", di);
+	if(dj<0) printf("\n****** sgemm_nn_libstr : dj<0 : %d<0 *****\n", dj);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** sgemm_nn_libstr : ai+m > row(A) : %d+%d > %d *****\n\n", ai, m, sA->m);
+ if(aj+k > sA->n) printf("\n***** sgemm_nn_libstr : aj+k > col(A) : %d+%d > %d *****\n\n", aj, k, sA->n);
+ // B: k x n
+ if(bi+k > sB->m) printf("\n***** sgemm_nn_libstr : bi+k > row(B) : %d+%d > %d *****\n\n", bi, k, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgemm_nn_libstr : bj+n > col(B) : %d+%d > %d *****\n\n", bj, n, sB->n);
+ // C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nn_libstr : ci+m > row(C) : %d+%d > %d *****\n\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nn_libstr : cj+n > col(C) : %d+%d > %d *****\n\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nn_libstr : di+m > row(D) : %d+%d > %d *****\n\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nn_libstr : dj+n > col(D) : %d+%d > %d *****\n\n", dj, n, sD->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs + bi/bs*bs*sdb;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int offsetB = bi%bs;
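+	// offsetB is the row offset of B inside its 8-row panel; the nn kernels handle this sub-panel alignment internally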
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+#if 1
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#else
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+#if 1
+ kernel_sgemm_nn_8x8_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+#else
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], &pD[(j+4)*bs+i*sdd]);
+#endif
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, 0, &pC[(j+4)*bs+i*sdc], sdc, 0, &pD[(j+4)*bs+i*sdd], sdd, 0, 8, 0, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, 0, &pC[(j+0)*bs+i*sdc], sdc, 0, &pD[(j+0)*bs+i*sdd], sdd, 0, 8, 0, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_8;
+ }
+#endif
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_8x8_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_8x4_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nssyrk_ln_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+
+ kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24: // 17 <= m <= 23
+ j = 0;
+ for(; j<i & j<m-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+ }
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+12));
+ if(j<m-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+ }
+ return;
+#endif
+
+ left_16: // 13 <= m <= 16
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+ }
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+ if(j<m-12) // 13 - 16
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ return;
+
+ left_12: // 9 <= m <= 12
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), m-(j+0));
+ }
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+8)*sdc], &pD[(j+0)*bs+(j+8)*sdd], m-(i+8), m-(j+0));
+ if(j<m-8) // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ return;
+
+ left_8: // 5 <= m <= 8
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+ }
+ if(j<m-4) // 5 - 8
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ }
+ else // 1 - 4
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ }
+ return;
+
+ left_4: // 1 <= m <= 4
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+ }
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ return;
+
+ }
+
+
+
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-23)
+ {
+ kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-15)
+ {
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+ }
+ }
+ }
+ return;
+
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ pA += ai/bs*bs*sda;
+ pB += bi/bs*bs*sdb;
+ int offsetB = bi%bs;
+ int di0 = di-ai%bs;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -8*sdd;
+ offsetD = bs+di0;
+ }
+
+ int ii, jj;
+
+ int offsetB4;
+
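+	// offsetB4 is the row offset of the second 4-column block of the triangular B inside its panel; when offsetB>=4 that block wraps into the next panel, hence the (jj+8)*sdb indexing in the else branch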
+ if(offsetB<4)
+ {
+ offsetB4 = offsetB+4;
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+			// XXX create left_24 once the _gen_ kernel exists !!!
+ for(; ii<m-23; ii+=24)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_24x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_24x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_24x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 24, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_24x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 24, n-jj-4);
+ }
+ }
+ }
+#endif
+ for(; ii<m-15; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 16, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 16, n-jj-4);
+ }
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<=8)
+ goto left_8;
+ else
+ goto left_16;
+ }
+ }
+ else
+ {
+ for(; ii<m-8; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(m-ii>0)
+ goto left_8;
+ }
+ }
+ else
+ {
+ offsetB4 = offsetB-4;
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+ for(; ii<m-15; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 8, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 8, n-jj-4);
+ }
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<=8)
+ goto left_8;
+ else
+ goto left_16;
+ }
+ }
+ else
+ {
+ for(; ii<m-8; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(m-ii>0)
+ goto left_8;
+ }
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_16:
+ if(offsetB<4)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ else
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ return;
+
+ left_8:
+ if(offsetB<4)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ else
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ return;
+
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ // TODO alpha
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+
+ int i, j;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+ if(m<=0 || n<=0)
+ return;
+
+ i = 0;
+
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x4_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0]);
+			kernel_strsm_nt_rl_inv_8x4_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4]);
+ }
+ if(n-j>0)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ if(n-j>4)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_8;
+ }
+
+ // common return if i==m
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+ }
+ if(n-j>0)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ }
+ return;
+
+ }
+
+
+
+
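
For orientation, a minimal usage sketch of strsm_rltn_libstr as defined above: with side=right, uplo=lower, trans=transposed, diag=not-unit it computes D = B * inv(A)^T for a lower-triangular A, and the guard at the top of the function requires ai=bi=di=0 and alpha=1.0. The creation/print helpers below (s_allocate_strmat, s_cvt_mat2strmat, s_print_strmat, s_free_strmat) and the header names are assumptions taken from BLASFEO's external-dependency aux API, not from this hunk.

/* sketch only: D = B * inv(A)^T with A lower triangular (strsm_rltn), alpha fixed to 1.0.
   helper functions and header names below are assumed from blasfeo_s_aux_ext_dep.h /
   blasfeo_s_blas.h and are not part of this diff */
#include "blasfeo_common.h"
#include "blasfeo_s_aux_ext_dep.h"
#include "blasfeo_s_blas.h"

void example_strsm_rltn(int m, int n, float *A, float *B)
	{
	struct s_strmat sA, sB, sD;
	s_allocate_strmat(n, n, &sA);              // n x n lower-triangular factor
	s_allocate_strmat(m, n, &sB);              // right-hand side
	s_allocate_strmat(m, n, &sD);              // result
	s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);   // column-major -> panel-major
	s_cvt_mat2strmat(m, n, B, m, &sB, 0, 0);
	strsm_rltn_libstr(m, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);   // D = B * inv(A)^T
	s_print_strmat(m, n, &sD, 0, 0);
	s_free_strmat(&sA); s_free_strmat(&sB); s_free_strmat(&sD);
	}
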
diff --git a/blas/s_blas_64.h b/blas/s_blas_64.h
new file mode 100644
index 0000000..1589867
--- /dev/null
+++ b/blas/s_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference the external BLAS and LAPACK routines employed in the BLASFEO WR (wrapper) version
+
+// level 1
+void scopy_(long long *m, float *x, long long *incx, float *y, long long *incy);
+void saxpy_(long long *m, float *alpha, float *x, long long *incx, float *y, long long *incy);
+void sscal_(long long *m, float *alpha, float *x, long long *incx);
+
+// level 2
+void sgemv_(char *ta, long long *m, long long *n, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void ssymv_(char *uplo, long long *m, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void strmv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void strsv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void sger_(long long *m, long long *n, float *alpha, float *x, long long *incx, float *y, long long *incy, float *A, long long *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, float *alpha, float *A, long long *lda, float *B, long long *ldb, float *beta, float *C, long long *ldc);
+void ssyrk_(char *uplo, char *trans, long long *n, long long *k, float *alpha, float *A, long long *lda, float *beta, float *C, long long *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+
+// lapack
+long long spotrf_(char *uplo, long long *m, float *A, long long *lda, long long *info);
+long long sgetrf_(long long *m, long long *n, float *A, long long *lda, long long *ipiv, long long *info);
+void sgeqrf_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *lwork, long long *info);
+void sgeqr2_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
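
These prototypes pass every integer argument as long long, matching an external BLAS/LAPACK built with 64-bit integers (this header is selected when REF_BLAS_BLIS is defined, see s_lapack_lib.c below). A minimal call sketch against the sgemm_ prototype declared above; the example function and its operands are illustrative, plain column-major arrays.

/* sketch: C = alpha*A*B + beta*C through the 64-bit-integer sgemm_ prototype above */
#include "s_blas_64.h"

void example_sgemm_ilp64(float *A, float *B, float *C)
	{
	long long m = 4, n = 4, k = 4;
	long long lda = 4, ldb = 4, ldc = 4;
	float alpha = 1.0, beta = 0.0;
	char ta = 'n', tb = 'n';
	sgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
	}
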
diff --git a/blas/s_lapack_lib.c b/blas/s_lapack_lib.c
new file mode 100644
index 0000000..c7cb56b
--- /dev/null
+++ b/blas/s_lapack_lib.c
@@ -0,0 +1,76 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GELQF_LIBSTR sgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR sgelqf_work_size_libstr
+#define GEQRF_LIBSTR sgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR sgeqrf_work_size_libstr
+#define GETF2_NOPIVOT sgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR sgetrf_nopivot_libstr
+#define GETRF_LIBSTR sgetrf_libstr
+#define POTRF_L_LIBSTR spotrf_l_libstr
+#define POTRF_L_MN_LIBSTR spotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR ssyrk_spotrf_ln_libstr
+
+#define COPY scopy_
+#define GELQF sgelqf_
+#define GEMM sgemm_
+#define GER sger_
+#define GEQRF sgeqrf_
+#define GEQR2 sgeqr2_
+#define GETRF sgetrf_
+#define POTRF spotrf_
+#define SCAL sscal_
+#define SYRK ssyrk_
+#define TRSM strsm_
+
+
+#include "x_lapack_lib.c"
+
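
s_lapack_lib.c contains no routine bodies of its own: it pins down the precision (REAL, s_strmat/s_strvec) and the external BLAS/LAPACK symbol names as macros and then includes the shared template x_lapack_lib.c, which is written entirely against those macros (the double-precision d_lapack_lib.c reuses the same template with the d* names). A standalone, compilable demonstration of the pattern; the names in it are illustrative and not taken from x_lapack_lib.c.

/* demonstration of the single-source / two-precision macro pattern used above
   (illustrative names only, not the real x_lapack_lib.c) */
#define REAL float
#define SCAL_LIBSTR sscal_demo

// "template" body, written once against the macros:
void SCAL_LIBSTR(int n, REAL alpha, REAL *x)
	{
	int i;
	for(i=0; i<n; i++)
		x[i] *= alpha;
	}
// after preprocessing this defines sscal_demo(); a second translation unit that sets
// REAL to double and SCAL_LIBSTR to dscal_demo would get the double-precision copy
// from the very same body, which is how s_lapack_lib.c and d_lapack_lib.c share x_lapack_lib.c.
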
diff --git a/blas/s_lapack_lib4.c b/blas/s_lapack_lib4.c
new file mode 100644
index 0000000..7d02d36
--- /dev/null
+++ b/blas/s_lapack_lib4.c
@@ -0,0 +1,664 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void ssyrk_spotrf_nt_l_lib(int m, int n, int k, float *pA, int sda, float *pB, int sdb, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ int alg = 1; // XXX
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dgemm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+ }
+ else
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ return;
+
+ }
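
Judging from the kernels this routine dispatches (sgemm+strsm for the panels below the diagonal, ssyrk+spotrf for the diagonal blocks), ssyrk_spotrf_nt_l_lib fuses the symmetric rank-k update with the Cholesky factorization. A sketch of the recurrence it appears to implement, splitting the m rows of C and D into the leading n x n block and the remainder:

\[
D_{11} D_{11}^{\top} = C_{11} + A_1 B^{\top}, \qquad
D_{21} = \left( C_{21} + A_2 B^{\top} \right) D_{11}^{-\top},
\]

with \(D_{11}\) lower triangular and inv_diag_D returning the reciprocals of its diagonal entries.
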
+
+
+
+void sgetrf_nn_nopivot_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, jj, ie;
+
+ // main loop
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_strsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_sgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd]);
+ }
+ if(jj<n)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+ }
+ return;
+
+ }
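
The three phases marked "solve lower", "factorize" and "solve upper" above are a blocked Doolittle LU without pivoting, advancing one 4-row panel of D per outer iteration. In block notation, the recurrences the kernels implement are (a sketch; L is unit lower triangular, U upper triangular, stored together in pD with L strictly below the diagonal and U on and above it):

\[
L_{ij} = \Big( A_{ij} - \sum_{k<j} L_{ik} U_{kj} \Big) U_{jj}^{-1} \;\; (j<i), \qquad
L_{ii} U_{ii} = A_{ii} - \sum_{k<i} L_{ik} U_{ki}, \qquad
U_{ij} = L_{ii}^{-1} \Big( A_{ij} - \sum_{k<i} L_{ik} U_{kj} \Big) \;\; (j>i),
\]

with the reciprocals of the diagonal of U returned in inv_diag_D, consistent with the _ru_inv (right-upper solve, inverted diagonal) and _ll_one (left-lower solve, unit diagonal) kernel suffixes.
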
+
+
+
+void sgetrf_nn_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D, int *ipiv)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, jj, i0, i1, j0, ll, p;
+
+ float d1 = 1.0;
+ float dm1 = -1.0;
+
+// // needs to perform row-exchanges on the yet-to-be-factorized matrix too
+// if(pC!=pD)
+// sgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+ // minimum matrix size
+ p = n<m ? n : m; // XXX
+
+ // main loop
+ // 4 columns at a time
+ jj = 0;
+ for(; jj<p-3; jj+=4) // XXX
+ {
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd]);
+ }
+ if(m-ii>0)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, 4);
+ }
+ kernel_sgetrf_pivot_4_lib4(m-i0, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd]);
+ }
+ if(n-ll>0)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], 4, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ goto left_n_4;
+ }
+ }
+ else
+ {
+ if(m-jj>0)
+ {
+ goto left_m_4;
+ }
+ }
+
+ // common return if jj==n
+ return;
+
+ // clean up
+
+ left_n_4:
+ // 1-4 columns at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m; ii+=4)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+ }
+ kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ if(n-jj>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ if(n-jj>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ if(n-jj>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ if(0) // there is no upper
+ {
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+ }
+ }
+ return;
+
+
+ left_m_4:
+ // 1-4 rows at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+ kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ if(m-i0>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ if(m-i0>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ if(m-i0>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+ }
+ return;
+
+ }
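
The pointer arithmetic fed to srowsw_lib above is the panel-major addressing rule used throughout these files: rows are grouped into panels of height bs, each panel stored contiguously with column stride bs and panel stride sdd. A small helper making the rule explicit (illustration only, not part of the library):

/* illustration only: element (i,j) of a panel-major matrix pD with panel height bs
   and panel stride sdd lives at pD[(i/bs)*bs*sdd + i%bs + j*bs]; the expression
   pD + (i0+r)/bs*bs*sdd + (i0+r)%bs + (jj+4)*bs above is this rule evaluated at
   row i0+r, column jj+4, i.e. the start of the row segment handed to srowsw_lib */
static inline float *panel_elem(float *pD, int sdd, int bs, int i, int j)
	{
	return pD + (i/bs)*bs*sdd + (i%bs) + j*bs;
	}
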
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dpotrf
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\nspotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA;
+ if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4: // 1 - 3
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ return;
+
+ }
+
+
+
+// dpotrf
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+		printf("\nspotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA;
+ if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-3)
+ {
+ kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+
+
+
+// dsyrk dpotrf
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+// dgetrf without pivoting
+void sgetrf_nopivot_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ci!=0 | di!=0)
+ {
+		printf("\nsgetrf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ sgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+
+// dgetrf pivoting
+void sgetrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, int *ipiv)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\nsgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	// needs to perform row-exchanges on the yet-to-be-factorized matrix too
+ if(pC!=pD)
+ sgecp_libstr(m, n, sC, ci, cj, sD, di, dj);
+ sgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+ {
+ printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+ exit(1);
+ return 0;
+ }
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ printf("\nsgeqrf_libstr: feature not implemented yet\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
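
A minimal usage sketch of the new-interface Cholesky routine in this file: spotrf_l_libstr overwrites sD with the lower factor of a symmetric positive definite sC. As in the earlier sketch, the creation/print helpers and the blasfeo_s_lapack.h header name are assumptions taken from BLASFEO's aux/external-dependency API, not from this hunk.

/* sketch: lower Cholesky factor, D*D^T = C, using the libstr interface above */
#include "blasfeo_common.h"
#include "blasfeo_s_aux_ext_dep.h"   // s_allocate_strmat & co. (assumed helpers)
#include "blasfeo_s_lapack.h"        // spotrf_l_libstr (assumed header name)

void example_spotrf(int m, float *C_colmajor)
	{
	struct s_strmat sC, sD;
	s_allocate_strmat(m, m, &sC);
	s_allocate_strmat(m, m, &sD);
	s_cvt_mat2strmat(m, m, C_colmajor, m, &sC, 0, 0);
	spotrf_l_libstr(m, &sC, 0, 0, &sD, 0, 0);   // D lower triangular, D*D^T = C
	s_print_strmat(m, m, &sD, 0, 0);
	s_free_strmat(&sC);
	s_free_strmat(&sD);
	}
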
diff --git a/blas/s_lapack_lib8.c b/blas/s_lapack_lib8.c
new file mode 100644
index 0000000..3b5239e
--- /dev/null
+++ b/blas/s_lapack_lib8.c
@@ -0,0 +1,872 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nspotrf_l_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_spotrf_nt_l_8x8_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24: // 17 <= m <= 23
+ j = 0;
+ for(; j<i & j<m-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+ }
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), m-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), m-(j+12));
+ if(j<m-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+ }
+ return;
+#endif
+
+ left_16: // 9 <= m <= 16
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+ }
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+ if(j<m-12) // 13 - 16
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12: // 9 <= m <= 12
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+ }
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+ if(j<m-8) // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[(j+8)], m-(i+8), m-(j+8));
+ }
+ return;
+#endif
+
+ left_8: // 1 <= m <= 8
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ if(j<m-4) // 5 - 8
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ else // 1 - 4
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4: // 1 <= m <= 4
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ return;
+#endif
+
+ }
+
+
+
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nspotrf_l_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-23)
+ {
+ kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_spotrf_nt_l_8x8_lib8((j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-15)
+ {
+ kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+// ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-23)
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_ssyrk_spotrf_nt_l_20x4_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], (j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-15)
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], j+12, &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[0+j*sdb], j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+j*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ return;
+
+ }
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+ {
+ printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+ exit(1);
+ return 0;
+ }
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ printf("\nsgeqrf_libstr: feature not implemented yet\n");
+ exit(1);
+ return;
+ }
+
+
+
+
diff --git a/blas/x_blas1_lib.c b/blas/x_blas1_lib.c
new file mode 100644
index 0000000..5f8fc2e
--- /dev/null
+++ b/blas/x_blas1_lib.c
@@ -0,0 +1,186 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
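+// scale a vector and add it to another vector: z = y + alpha*x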
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int ii;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for(; ii<m; ii++)
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ z[ii+1] = x[ii+1] * y[ii+1];
+ z[ii+2] = x[ii+2] * y[ii+2];
+ z[ii+3] = x[ii+3] * y[ii+3];
+ dot += z[ii+0] + z[ii+1] + z[ii+2] + z[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
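+// scale a vector and add it to another vector: z = y + alpha*x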
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int i1 = 1;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ if(y!=z)
+ COPY(&m, y, &i1, z, &i1);
+ AXPY(&m, &alpha, x, &i1, z, &i1);
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas2_diag_lib.c b/blas/x_blas2_diag_lib.c
new file mode 100644
index 0000000..e90cbd6
--- /dev/null
+++ b/blas/x_blas2_diag_lib.c
@@ -0,0 +1,51 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
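+// matrix-vector multiplication with a diagonal matrix stored as a vector: z = alpha*diag(a)*x + beta*y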
+void GEMV_DIAG_LIBSTR(int m, REAL alpha, struct STRVEC *sA, int ai, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int ii;
+ REAL *a = sA->pa + ai;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ if(alpha==1.0 & beta==1.0)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = a[ii]*x[ii] + y[ii];
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha*a[ii]*x[ii] + beta*y[ii];
+ }
+
+ return;
+
+ }
diff --git a/blas/x_blas2_lib.c b/blas/x_blas2_lib.c
new file mode 100644
index 0000000..32e1e0a
--- /dev/null
+++ b/blas/x_blas2_lib.c
@@ -0,0 +1,1466 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
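+// general matrix-vector multiplication, A not transposed: z = alpha*A*x + beta*y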
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1, y_2, y_3,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+#if 1 // y reg version
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = 0.0;
+ y_1 = 0.0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+ }
+ if(jj<n)
+ {
+ y_0 += pA[ii+0+lda*jj] * x[jj];
+ y_1 += pA[ii+1+lda*jj] * x[jj];
+ }
+ z[ii+0] = beta * y[ii+0] + alpha * y_0;
+ z[ii+1] = beta * y[ii+1] + alpha * y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = 0.0;
+ for(jj=0; jj<n; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ z[ii] = beta * y[ii] + alpha * y_0;
+ }
+#else // x reg version
+ for(ii=0; ii<n; ii++)
+ {
+ z[ii] = beta * y[ii];
+ }
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ x_0 = alpha * x[jj+0];
+ x_1 = alpha * x[jj+1];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii] += pA[ii+lda*(jj+0)] * x_0;
+ z[ii] += pA[ii+lda*(jj+1)] * x_1;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ x_0 = alpha * x[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ z[ii] += pA[ii+lda*(jj+0)] * x_0;
+ }
+ }
+#endif
+ return;
+ }
+
+
+
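+// general matrix-vector multiplication, A transposed: z = alpha*A'*x + beta*y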
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 = 0.0;
+ y_1 = 0.0;
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ y_1 += pA[ii+lda*(jj+1)] * x[ii];
+ }
+ z[jj+0] = beta * y[jj+0] + alpha * y_0;
+ z[jj+1] = beta * y[jj+1] + alpha * y_1;
+ }
+ for(; jj<n; jj++)
+ {
+ y_0 = 0.0;
+ for(ii=0; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ }
+ z[jj+0] = beta * y[jj+0] + alpha * y_0;
+ }
+ return;
+ }
+
+
+
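+// matrix-vector multiplication with both A and A' in a single sweep over A: z_n = alpha_n*A*x_n + beta_n*y_n, z_t = alpha_t*A'*x_t + beta_t*y_t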
+// TODO optimize !!!!!
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+ {
+ int ii, jj;
+ REAL
+ a_00,
+ x_n_0,
+ y_t_0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x_n = sx_n->pa + xi_n;
+ REAL *x_t = sx_t->pa + xi_t;
+ REAL *y_n = sy_n->pa + yi_n;
+ REAL *y_t = sy_t->pa + yi_t;
+ REAL *z_n = sz_n->pa + zi_n;
+ REAL *z_t = sz_t->pa + zi_t;
+ for(ii=0; ii<m; ii++)
+ {
+ z_n[ii] = beta_n * y_n[ii];
+ }
+ for(jj=0; jj<n; jj++)
+ {
+ y_t_0 = 0.0;
+ x_n_0 = alpha_n * x_n[jj];
+ for(ii=0; ii<m; ii++)
+ {
+ a_00 = pA[ii+lda*jj];
+ z_n[ii] += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t[ii];
+ }
+ z_t[jj] = beta_t * y_t[jj] + alpha_t * y_t_0;
+ }
+ return;
+ }
+
+
+
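+// symmetric matrix-vector multiplication, using the lower triangular part of A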
+// TODO optimize !!!!!
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ for(ii=0; ii<n; ii++)
+ {
+ y_0 = 0.0;
+ jj = 0;
+ for(; jj<=ii; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ for( ; jj<m; jj++)
+ {
+ y_0 += pA[jj+lda*ii] * x[jj];
+ }
+ z[ii] = beta * y[ii] + alpha * y_0;
+ }
+ return;
+ }
+
+
+
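+// triangular matrix-vector multiplication, lower, not transposed, not unit diagonal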
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(m-n>0)
+ {
+ GEMV_N_LIBSTR(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+ }
+ if(n%2!=0)
+ {
+ ii = n-1;
+ y_0 = x[ii];
+ y_0 *= pA[ii+lda*ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ z[ii] = y_0;
+ n -= 1;
+ }
+ for(ii=n-2; ii>=0; ii-=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ y_1 *= pA[ii+1+lda*(ii+1)];
+ y_1 += pA[ii+1+lda*(ii+0)] * y_0;
+ y_0 *= pA[ii+0+lda*(ii+0)];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+ }
+// XXX no clean-up loop is needed here: ii is even at this point, so the 2-wide jj loop covers all columns below ii
+// for(; jj<ii; jj++)
+// {
+// y_0 += pA[ii+0+lda*jj] * x[jj];
+// y_1 += pA[ii+1+lda*jj] * x[jj];
+// }
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ return;
+ }
+
+
+
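+// triangular matrix-vector multiplication, lower, transposed, not unit diagonal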
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ y_0 *= pA[jj+0+lda*(jj+0)];
+ y_0 += pA[jj+1+lda*(jj+0)] * y_1;
+ y_1 *= pA[jj+1+lda*(jj+1)];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ y_1 += pA[ii+lda*(jj+1)] * x[ii];
+ }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ for(; jj<n; jj++)
+ {
+ y_0 = x[jj];
+ y_0 *= pA[jj+lda*jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*jj] * x[ii];
+ }
+ z[jj] = y_0;
+ }
+ return;
+ }
+
+
+
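+// triangular matrix-vector multiplication, upper, not transposed, not unit diagonal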
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+#if 1 // y reg version
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ y_0 = pA[jj+0+lda*(jj+0)] * y_0;
+ y_0 += pA[jj+0+lda*(jj+1)] * y_1;
+ y_1 = pA[jj+1+lda*(jj+1)] * y_1;
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0] + pA[jj+0+lda*(ii+1)] * x[ii+1];
+ y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0] + pA[jj+1+lda*(ii+1)] * x[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0];
+ y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0];
+ }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ for(; jj<m; jj++)
+ {
+ y_0 = pA[jj+lda*jj] * x[jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 += pA[jj+lda*ii] * x[ii];
+ }
+ z[jj] = y_0;
+ }
+#else // x reg version
+ if(x != z)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = x[ii];
+ }
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ x_0 = z[jj+0];
+ x_1 = z[jj+1];
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+// XXX there is no clean-up loop, since jj+=2 !!!!!
+// for(; ii<jj; ii++)
+// {
+// z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+// }
+ x_0 *= pA[jj+0+lda*(jj+0)];
+ x_0 += pA[jj+0+lda*(jj+1)] * x_1;
+ x_1 *= pA[jj+1+lda*(jj+1)];
+ z[jj+0] = x_0;
+ z[jj+1] = x_1;
+ }
+ for(; jj<m; jj++)
+ {
+ x_0 = z[jj];
+ for(ii=0; ii<jj; ii++)
+ {
+ z[ii] += pA[ii+lda*jj] * x_0;
+ }
+ x_0 *= pA[jj+lda*jj];
+ z[jj] = x_0;
+ }
+#endif
+ return;
+ }
+
+
+
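+// triangular matrix-vector multiplication, upper, transposed, not unit diagonal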
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(m%2!=0)
+ {
+ jj = m-1;
+ y_0 = pA[jj+lda*jj] * x[jj];
+ for(ii=0; ii<jj; ii++)
+ {
+ y_0 += pA[ii+lda*jj] * x[ii];
+ }
+ z[jj] = y_0;
+		m -= 1; // XXX m is now even for the main loop below
+ }
+ for(jj=m-2; jj>=0; jj-=2)
+ {
+ y_1 = pA[jj+1+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[jj+0+lda*(jj+1)] * x[jj+0];
+ y_0 = pA[jj+0+lda*(jj+0)] * x[jj+0];
+ for(ii=0; ii<jj-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+// XXX no clean-up loop is needed here: jj is even at this point, so the 2-wide ii loop covers all rows above jj
+// if(ii<jj)
+// {
+// y_0 += pA[ii+lda*(jj+0)] * x[ii];
+// y_1 += pA[ii+lda*(jj+1)] * x[ii];
+// }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
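+// triangular system solve with an m x n lower triangular factor, not transposed, not unit diagonal: solve the top n x n system, then update the remaining m-n components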
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj, j1;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+#if 1 // y reg version
+ ii = 0;
+ for(; ii<n-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+// XXX no clean-up loop is needed here: ii is even at this point, so the 2-wide jj loop covers all columns to the left of ii
+// if(jj<ii)
+// {
+// y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+// y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+// }
+ y_0 *= dA[ii+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+ y_1 *= dA[ii+1];
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<n; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ y_0 *= dA[ii];
+ z[ii] = y_0;
+ }
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+ if(jj<n)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+ }
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ z[ii] = y_0;
+ }
+#else // x reg version
+ if(x != z)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = x[ii];
+ }
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ x_0 = dA[jj+0] * z[jj+0];
+ x_1 = z[jj+1] - pA[jj+1+lda*(jj+0)] * x_0;
+ x_1 = dA[jj+1] * x_1;
+ z[jj+0] = x_0;
+ z[jj+1] = x_1;
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ z[ii+0] -= pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] -= pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii] -= pA[ii+lda*(jj+0)] * x_0 + pA[ii+lda*(jj+1)] * x_1;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ x_0 = dA[jj] * z[jj];
+ z[jj] = x_0;
+ for(ii=jj+1; ii<m; ii++)
+ {
+ z[ii] -= pA[ii+lda*jj] * x_0;
+ }
+ }
+#endif
+ return;
+ }
+
+
+
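+// triangular system solve with an m x n lower triangular factor, transposed, not unit diagonal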
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ if(n%2!=0)
+ {
+ jj = n-1;
+ y_0 = x[jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[ii];
+ }
+ y_0 *= dA[jj];
+ z[jj] = y_0;
+ jj -= 2;
+ }
+ else
+ {
+ jj = n-2;
+ }
+ for(; jj>=0; jj-=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+ y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+ y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+ }
+ y_1 *= dA[jj+1];
+ y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+ y_0 *= dA[jj+0];
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
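+// triangular system solve, lower, not transposed, not unit diagonal: z = inv(A) * x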
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj, j1;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+ y_0 *= dA[ii+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+ y_1 *= dA[ii+1];
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ y_0 *= dA[ii];
+ z[ii] = y_0;
+ }
+ return;
+ }
+
+
+
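+// triangular system solve, lower, not transposed, unit diagonal (not implemented in this backend)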
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
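+// triangular system solve, lower, transposed, not unit diagonal: z = inv(A') * x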
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ if(m%2!=0)
+ {
+ jj = m-1;
+ y_0 = x[jj];
+ y_0 *= dA[jj];
+ z[jj] = y_0;
+ jj -= 2;
+ }
+ else
+ {
+ jj = m-2;
+ }
+ for(; jj>=0; jj-=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+ y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+ y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+ }
+ y_1 *= dA[jj+1];
+ y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+ y_0 *= dA[jj+0];
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
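+// triangular system solve, lower, transposed, unit diagonal (not implemented in this backend)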
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
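+// triangular system solve, upper, not transposed, not unit diagonal (not implemented in this backend)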
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
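+// triangular system solve, upper, transposed, not unit diagonal (not implemented in this backend)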
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, y, &i1, z, &i1);
+ GEMV(&cn, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ return;
+ }
+
+
+
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ COPY(&n, y, &i1, z, &i1);
+ GEMV(&ct, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ return;
+ }
+
+
+
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x_n = sx_n->pa + xi_n;
+ REAL *x_t = sx_t->pa + xi_t;
+ REAL *y_n = sy_n->pa + yi_n;
+ REAL *y_t = sy_t->pa + yi_t;
+ REAL *z_n = sz_n->pa + zi_n;
+ REAL *z_t = sz_t->pa + zi_t;
+ COPY(&m, y_n, &i1, z_n, &i1);
+ GEMV(&cn, &m, &n, &alpha_n, pA, &lda, x_n, &i1, &beta_n, z_n, &i1);
+ COPY(&n, y_t, &i1, z_t, &i1);
+ GEMV(&ct, &m, &n, &alpha_t, pA, &lda, x_t, &i1, &beta_t, z_t, &i1);
+ return;
+ }
+
+
+
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ COPY(&m, y, &i1, z, &i1);
+ SYMV(&cl, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ GEMV(&cn, &tmp, &n, &alpha, pA+n, &lda, x, &i1, &beta, z+n, &i1);
+ GEMV(&ct, &tmp, &n, &alpha, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL d0 = 0.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ if(x!=z)
+ COPY(&n, x, &i1, z, &i1);
+ GEMV(&cn, &tmp, &n, &d1, pA+n, &lda, x, &i1, &d0, z+n, &i1);
+ TRMV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ if(x!=z)
+ COPY(&n, x, &i1, z, &i1);
+ TRMV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+ GEMV(&ct, &tmp, &n, &d1, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRMV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRMV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int mmn = m-n;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+ GEMV(&cn, &mmn, &n, &dm1, pA+n, &lda, z, &i1, &d1, z+n, &i1);
+ return;
+ }
+
+
+
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int mmn = m-n;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ GEMV(&ct, &mmn, &n, &dm1, pA+n, &lda, z+n, &i1, &d1, z, &i1);
+ TRSV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cu, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &ct, &cu, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas3_diag_lib.c b/blas/x_blas3_diag_lib.c
new file mode 100644
index 0000000..d5cce93
--- /dev/null
+++ b/blas/x_blas3_diag_lib.c
@@ -0,0 +1,170 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
+void GEMM_L_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRVEC *sA, int ai, struct STRMAT *sB, int bi, int bj, double beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *dA = sA->pa + ai;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL a0, a1;
+ if(beta==0.0)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ a0 = alpha * dA[ii+0];
+ a1 = alpha * dA[ii+1];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+ pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj];
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ a0 = alpha * dA[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+ }
+ }
+ }
+ else
+ {
+ int ldc = sC->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ a0 = alpha * dA[ii+0];
+ a1 = alpha * dA[ii+1];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+ pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj] + beta * pC[ii+1+ldc*jj];
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ a0 = alpha * dA[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// dgemm with B diagonal matrix (stored as strvec)
+void GEMM_R_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sB, int bi, double beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dB = sB->pa + bi;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL a0, a1;
+	if(beta==0.0)
+ {
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ a0 = alpha * dB[jj+0];
+ a1 = alpha * dB[jj+1];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+ pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ a0 = alpha * dB[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+ }
+ }
+ }
+ else
+ {
+ int ldc = sC->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ a0 = alpha * dB[jj+0];
+ a1 = alpha * dB[jj+1];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+ pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)] + beta * pC[ii+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ a0 = alpha * dB[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_blas3_lib.c b/blas/x_blas3_lib.c
new file mode 100644
index 0000000..29a33c7
--- /dev/null
+++ b/blas/x_blas3_lib.c
@@ -0,0 +1,1531 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dgemm nt
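+// computes D = alpha * A * B^T + beta * C with a plain triple loop and 2x2 register blocking;
+// in the double-precision wrapper this macro typically expands to dgemm_nt_libstr (assumption
+// based on the usual BLASFEO naming scheme)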
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+ c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ }
+ }
+ return;
+ }
+
+
+
+// dgemm nn
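+// computes D = alpha * A * B + beta * C; identical to the nt variant except that B is addressed
+// by columns (pB[kk+ldb*jj]) instead of rows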
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+ c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+			c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ }
+ }
+ return;
+ }
+
+
+
+// dtrsm_left_lower_nottransposed_unit
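+// solves L * X = alpha * B for X, with L the lower triangular part of A and a unit diagonal
+// (the diagonal entries of A are never read); the result X overwrites D by forward substitution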
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ d_00, d_01,
+ d_10, d_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda; // triangular
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+#if 1
+ // solve
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ d_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ d_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ d_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ kk = 0;
+#if 0
+ for(; kk<ii-1; kk+=2)
+ {
+ d_00 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+ d_10 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+ d_11 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+ d_00 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+ d_10 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+ d_11 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+ }
+ if(kk<ii)
+#else
+ for(; kk<ii; kk++)
+#endif
+ {
+ d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
+ }
+ d_10 -= pA[ii+1+lda*kk] * d_00;
+ d_11 -= pA[ii+1+lda*kk] * d_01;
+ pD[ii+0+ldd*(jj+0)] = d_00;
+ pD[ii+1+ldd*(jj+0)] = d_10;
+ pD[ii+0+ldd*(jj+1)] = d_01;
+ pD[ii+1+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = alpha * pB[ii+ldb*(jj+0)];
+ d_01 = alpha * pB[ii+ldb*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[ii+ldd*(jj+0)] = d_00;
+ pD[ii+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = alpha * pB[ii+0+ldb*jj];
+ d_10 = alpha * pB[ii+1+ldb*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
+ d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
+ }
+ d_10 -= pA[ii+1+lda*kk] * d_00;
+ pD[ii+0+ldd*jj] = d_00;
+ pD[ii+1+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
+ }
+ pD[ii+ldd*jj] = d_00;
+ }
+ }
+#else
+ // copy
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+ }
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[ii+ldd*jj];
+ for(kk=ii+1; kk<m; kk++)
+ {
+ pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+ }
+ }
+ }
+#endif
+ return;
+ }
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
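+// solves U * X = alpha * B for X, with U the upper triangular part of A and a non-unit diagonal;
+// the reciprocals of the diagonal are taken from the scratch vector sA->dA (recomputed here unless
+// already cached and the A offsets are zero), and the substitution proceeds bottom-up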
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, id;
+ REAL
+ d_00, d_01,
+ d_10, d_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda; // triangular
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dA = sA->dA;
+ if(!(sA->use_dA==1 & ai==0 & aj==0))
+ {
+		// invert the diagonal of pA
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0/pA[ii+lda*ii];
+		// the cached inverse is valid for this call only
+ sA->use_dA = 0;
+ }
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ id = m-ii-2;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_10 = alpha * pB[id+1+ldb*(jj+0)];
+ d_01 = alpha * pB[id+0+ldb*(jj+1)];
+ d_11 = alpha * pB[id+1+ldb*(jj+1)];
+ kk = id+2;
+#if 0
+ for(; kk<m-1; kk+=2)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_00 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+ }
+ if(kk<m)
+#else
+ for(; kk<m; kk++)
+#endif
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ }
+ d_10 *= dA[id+1];
+ d_11 *= dA[id+1];
+ d_00 -= pA[id+0+lda*(id+1)] * d_10;
+ d_01 -= pA[id+0+lda*(id+1)] * d_11;
+ d_00 *= dA[id+0];
+ d_01 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+1+ldd*(jj+0)] = d_10;
+ pD[id+0+ldd*(jj+1)] = d_01;
+ pD[id+1+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ id = m-ii-1;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_01 = alpha * pB[id+0+ldb*(jj+1)];
+ kk = id+1;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ }
+ d_00 *= dA[id+0];
+ d_01 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+0+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ id = m-ii-2;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_10 = alpha * pB[id+1+ldb*(jj+0)];
+ kk = id+2;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ }
+ d_10 *= dA[id+1];
+ d_00 -= pA[id+0+lda*(id+1)] * d_10;
+ d_00 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+1+ldd*(jj+0)] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ id = m-ii-1;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ kk = id+1;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ }
+ d_00 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ }
+ }
+#else
+ // copy
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+ }
+ // solve
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=m-1; ii>=0; ii--)
+ {
+ d_00 = pD[ii+ldd*jj] * dA[ii];
+ pD[ii+ldd*jj] = d_00;
+ for(kk=0; kk<ii; kk++)
+ {
+ pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+ }
+ }
+ }
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
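+// solves X * L^T = alpha * B for X (i.e. D = alpha * B * L^-T), with L the lower triangular part
+// of A and a unit diagonal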
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL
+ f_10,
+ c_00, c_01,
+ c_10, c_11;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ f_10 = pA[jj+1+lda*(jj+0)];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ pD[ii+1+ldd*(jj+1)] = c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+		// remaining column (unit diagonal: no scaling needed)
+ for(ii=0; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+ }
+ pD[ii+ldd*jj] = c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
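+// solves X * L^T = alpha * B for X, with L the lower triangular part of A and a non-unit diagonal;
+// the reciprocal diagonal is cached in sA->dA when the A offsets are zero, so repeated calls on the
+// same factor avoid the divisions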
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dA = sA->dA;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ f_00_inv = dA[jj+0];
+ f_10 = pA[jj+1+lda*(jj+0)];
+ f_11_inv = dA[jj+1];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ c_01 *= f_11_inv;
+ c_11 *= f_11_inv;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ pD[ii+1+ldd*(jj+1)] = c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ c_01 *= f_11_inv;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+		// remaining column: scale by the reciprocal of the diagonal
+ f_00_inv = dA[jj];
+ for(ii=0; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+ldd*jj] = c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+ printf("\ndtrsm_rutn_libstr: feature not implemented yet\n");
+ exit(1);
+// if(!(pB==pD))
+// {
+// for(jj=0; jj<n; jj++)
+// COPY(&m, pB+jj*sB->m, &i1, pD+jj*sD->m, &i1);
+// }
+// TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &(sA->m), pD, &(sD->m));
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
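+// computes D = alpha * B * U^T, with U the upper triangular part of A; only the entries of A on
+// and above the diagonal are read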
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+ c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
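+// computes D = alpha * B * L, with L the lower triangular part of A; only the entries of A on
+// and below the diagonal are read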
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+ c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+			c_01 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
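+// computes the lower triangle of D = beta * C + alpha * A * B^T for an m x m result; since A and B
+// are allowed to differ, this is effectively a gemm restricted to the lower triangle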
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int ii, jj, kk;
+ int n = m; // TODO optimize for this case !!!!!!!!!
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // diagonal
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+ pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+ pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+ // lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // diagonal
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+ // lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // diagonal
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+ pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+ pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+ // lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // diagonal
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+ // lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dgemm nt
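+// thin wrapper around the external BLAS gemm: since BLAS updates C in place, C is first copied
+// into D (unless beta==0 or C and D alias), then GEMM is called on D directly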
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cn = 'n';
+ char ct = 't';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cn = 'n';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &cn, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cl, &cn, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
+ }
+ TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cu, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cl, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
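+// BLAS wrapper: dispatches to SYRK when A and B alias (a true rank-k update) and falls back to
+// GEMM when the two factors differ, after copying C into D if needed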
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &mm, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &mm, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#endif
+ return;
+ }
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long mmn = mm-nn;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &nn, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &nn, &kk, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#endif
+ return;
+ }
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_lapack_lib.c b/blas/x_lapack_lib.c
new file mode 100644
index 0000000..762a8a0
--- /dev/null
+++ b/blas/x_lapack_lib.c
@@ -0,0 +1,2112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dpotrf
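+// computes the lower Cholesky factor D of the symmetric (lower-stored) matrix C, i.e. D * D^T = C;
+// the reciprocals of the diagonal are stored in sD->dA, and a non-positive pivot is replaced by a
+// zero inverse instead of aborting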
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<m; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+ // solve lower
+// for(ii=jj+1; ii<m; ii++)
+// {
+// c_00 = pC[ii+ldc*jj];
+// for(kk=0; kk<jj; kk++)
+// {
+// c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+// }
+// pD[ii+ldd*jj] = c_00 * f_00_inv;
+// }
+ }
+ return;
+ }
+
+
+
+// dpotrf
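+// rectangular variant of the lower Cholesky factorization: the top n x n block is factorized and
+// the remaining m-n rows are solved against the transposed factor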
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+ // solve lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = pC[ii+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+ }
+ pD[ii+ldd*jj] = c_00 * f_00_inv;
+ }
+ }
+ return;
+ }
+
+
+
+// dsyrk dpotrf
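+// fused syrk + potrf: forms the lower triangle of C + A * B^T and Cholesky-factorizes it in a
+// single pass, writing the factor to D and the reciprocal diagonal to sD->dA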
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+ // solve lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = pC[ii+ldc*jj];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+ }
+ pD[ii+ldd*jj] = c_00 * f_00_inv;
+ }
+ }
+ return;
+ }
+
+
+
+// dgetrf without pivoting
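+// unblocked right-looking LU factorization without pivoting of a plain column-major m x n array;
+// the reciprocals of the pivots are returned in dA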
+void GETF2_NOPIVOT(int m, int n, REAL *A, int lda, REAL *dA)
+ {
+ int ii, jj, kk, itmp0, itmp1;
+ int iimax = m<n ? m : n;
+ int i1 = 1;
+ REAL dtmp;
+ REAL dm1 = -1.0;
+
+ for(ii=0; ii<iimax; ii++)
+ {
+ itmp0 = m-ii-1;
+ dtmp = 1.0/A[ii+lda*ii];
+ dA[ii] = dtmp;
+ for(jj=0; jj<itmp0; jj++)
+ {
+ A[ii+1+jj+lda*ii] *= dtmp;
+ }
+ itmp1 = n-ii-1;
+ for(jj=0; jj<itmp1; jj++)
+ {
+ for(kk=0; kk<itmp0; kk++)
+ {
+ A[(ii+1+kk)+lda*(ii+1+jj)] -= A[(ii+1+kk)+lda*ii] * A[ii+lda*(ii+1+jj)];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// dgetrf without pivoting
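+// strmat interface to the LU factorization without pivoting; the factors L (unit lower) and U are
+// stored in place in D and the reciprocal pivots in sD->dA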
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int ii, jj, kk;
+// int i1 = 1;
+// REAL d1 = 1.0;
+ REAL
+ d_00_inv, d_11_inv,
+ d_00, d_01,
+ d_10, d_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // upper
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // diagonal
+ ii = jj;
+ if(ii<m-1)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ d_10 *= d_00_inv;
+ d_11 -= d_10 * d_01;
+ d_11_inv = 1.0/d_11;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ dD[ii+0] = d_00_inv;
+ dD[ii+1] = d_11_inv;
+ ii += 2;
+ }
+ else if(ii<m)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ dD[ii+0] = d_00_inv;
+ ii += 1;
+ }
+ // lower
+ for(; ii<m-1; ii+=2)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
+ d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+ d_11 -= d_10 * pD[kk+ldd*(jj+1)];
+ d_01 *= d_11_inv;
+ d_11 *= d_11_inv;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+ d_01 *= d_11_inv;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // upper
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ // diagonal
+ ii = jj;
+ if(ii<m-1)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ d_10 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ dD[ii+0] = d_00_inv;
+ ii += 2;
+ }
+ else if(ii<m)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ dD[ii+0] = d_00_inv;
+ ii += 1;
+ }
+ // lower
+ for(; ii<m-1; ii+=2)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ }
+#else
+ if(pC!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pC[ii+ldc*jj];
+ }
+ }
+ }
+ GETF2_NOPIVOT(m, n, pD, ldd, dD);
+#endif
+ return;
+ }
+
+
+
+// dgetrf pivoting
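+// LU factorization with partial (row) pivoting; the selected pivot rows are returned in ipiv as
+// zero-based indices relative to the (di,dj) submatrix, and the reciprocal pivots go to sD->dA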
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+ {
+ int ii, i0, jj, kk, ip, itmp0, itmp1;
+ REAL dtmp, dmax;
+ REAL
+ d_00_inv, d_11_inv,
+ d_00, d_01,
+ d_10, d_11;
+ int i1 = 1;
+ REAL d1 = 1.0;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA+ci+cj*ldc;
+ REAL *pD = sD->pA+di+dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ // copy if needed
+ if(pC!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pC[ii+ldc*jj];
+ }
+ }
+ }
+ // factorize
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // correct diagonal and lower and look for pivot
+ // correct
+ ii = jj;
+ i0 = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // look for pivot & solve
+ // left column
+ ii = i0;
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_10>0.0 ? d_10 : -d_10;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ }
+ // row swap
+ ii = i0;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ // solve & compute next pivot
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+ d_11 -= d_10 * pD[jj+ldd*(jj+1)];
+ dtmp = d_01>0.0 ? d_01 : -d_01;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_11>0.0 ? d_11 : -d_11;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_00 *= d_00_inv;
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+ dtmp = d_01>0.0 ? d_01 : -d_01;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // row swap
+ ii = i0+1;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[ii+ldd*(jj+1)];
+ d_00_inv = 1.0/d_00;
+ pD[ii+ldd*(jj+1)] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ // solve lower
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[ii+ldd*(jj+1)];
+ d_00 *= d_00_inv;
+ pD[ii+ldd*(jj+1)] = d_00;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pD[ii+ldd*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[ii+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ pD[ii+ldd*jj] = d_00;
+ }
+ i0 = ii;
+ ii = jj;
+ // correct diagonal and lower and look for pivot
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_10>0.0 ? d_10 : -d_10;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ // row swap
+ ii = i0;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[ii+ldd*jj];
+ d_00_inv = 1.0/d_00;
+ pD[ii+ldd*jj] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pD[ii+ldd*jj];
+ // solve lower
+ d_00 *= d_00_inv;
+ pD[ii+ldd*jj] = d_00;
+ }
+ }
+#else
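+	// unblocked LU factorization with partial (row) pivoting: for each column, search the pivot, swap the rows, scale the sub-column by the inverse pivot (stored in dD) and apply a rank-1 update to the trailing submatrix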
+ int iimax = m<n ? m : n;
+ for(ii=0; ii<iimax; ii++)
+ {
+ dmax = (pD[ii+ldd*ii]>0 ? pD[ii+ldd*ii] : -pD[ii+ldd*ii]);
+ ip = ii;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ dtmp = pD[ii+jj+ldd*ii]>0 ? pD[ii+jj+ldd*ii] : -pD[ii+jj+ldd*ii];
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+jj;
+ }
+ }
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ dtmp = pD[ii+jj*ldd];
+ pD[ii+jj*ldd] = pD[ip+jj*ldd];
+ pD[ip+jj*ldd] = dtmp;
+ }
+ }
+ itmp0 = m-ii-1;
+ dtmp = 1.0/pD[ii+ldd*ii];
+ dD[ii] = dtmp;
+ for(jj=0; jj<itmp0; jj++)
+ {
+ pD[ii+1+jj+ldd*ii] *= dtmp;
+ }
+ itmp1 = n-ii-1;
+ for(jj=0; jj<itmp1; jj++)
+ {
+ for(kk=0; kk<itmp0; kk++)
+ {
+ pD[(ii+1+kk)+ldd*(ii+1+jj)] -= pD[(ii+1+kk)+ldd*ii] * pD[ii+ldd*(ii+1+jj)];
+ }
+ }
+ }
+#endif
+ return;
+ }
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ return 0;
+ }
+
+
+
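+// Householder QR factorization: the Householder vectors are stored below the diagonal of D and the tau scaling factors in dD; the main loop processes two columns per iteration using a 2x2 triangular block reflector T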
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA+ai+aj*lda;
+ REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+ REAL *dD = sD->dA+di; // vectors of tau
+ REAL alpha, beta, tmp, w0, w1;
+ REAL *pC00, *pC01, *pC11, *pv0, *pv1;
+ REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldw = 2;
+ REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldb = 2;
+ int imax, jmax, kmax;
+ // copy if needed
+ if(pA!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pA[ii+lda*jj];
+ }
+ }
+ }
+ imax = m<n ? m : n;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+ // first column
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ tmp = pC00[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau0
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pC00[0+ldd*0] = beta;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ pC00[jj+ldd*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ pC01 = &pC00[0+ldd*1];
+ pv0 = &pC00[0+ldd*0];
+ kmax = m-ii;
+ w0 = pC01[0+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*0] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*0] += w0 * pv0[kk];
+ }
+ // second column
+ pC11 = &pD[(ii+1)+ldd*(ii+1)];
+ beta = 0.0;
+ for(jj=1; jj<m-(ii+1); jj++)
+ {
+ tmp = pC11[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau1
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau1
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v1
+ pC11[0+ldd*0] = beta;
+ for(jj=1; jj<m-(ii+1); jj++)
+ pC11[jj+ldd*0] *= tmp;
+ }
+ // compute lower triangular T containing tau for matrix update
+ pv0 = &pC00[0+ldd*0];
+ pv1 = &pC00[0+ldd*1];
+ kmax = m-ii;
+ tmp = pv0[1];
+ for(kk=2; kk<kmax; kk++)
+ tmp += pv0[kk]*pv1[kk];
+ pT[0+ldb*0] = dD[ii+0];
+ pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldb*1] = dD[ii+1];
+ jmax = n-ii-2;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ // compute W^T = C^T * V
+ pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+ pW[1+ldw*0] = pC00[0+ldd*(jj+1+2)] + pC00[1+ldd*(jj+1+2)] * pv0[1];
+ pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
+ pW[1+ldw*1] = pC00[1+ldd*(jj+1+2)];
+ kk = 2;
+ for(; kk<kmax; kk++)
+ {
+ tmp = pC00[kk+ldd*(jj+0+2)];
+ pW[0+ldw*0] += tmp * pv0[kk];
+ pW[0+ldw*1] += tmp * pv1[kk];
+ tmp = pC00[kk+ldd*(jj+1+2)];
+ pW[1+ldw*0] += tmp * pv0[kk];
+ pW[1+ldw*1] += tmp * pv1[kk];
+ }
+ // compute W^T *= T
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+ pC00[0+ldd*(jj+1+2)] -= pW[1+ldw*0];
+ pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC00[1+ldd*(jj+1+2)] -= pv0[1]*pW[1+ldw*0] + pW[1+ldw*1];
+ kk = 2;
+ for(; kk<kmax-1; kk+=2)
+ {
+ pC00[kk+0+ldd*(jj+0+2)] -= pv0[kk+0]*pW[0+ldw*0] + pv1[kk+0]*pW[0+ldw*1];
+ pC00[kk+1+ldd*(jj+0+2)] -= pv0[kk+1]*pW[0+ldw*0] + pv1[kk+1]*pW[0+ldw*1];
+ pC00[kk+0+ldd*(jj+1+2)] -= pv0[kk+0]*pW[1+ldw*0] + pv1[kk+0]*pW[1+ldw*1];
+ pC00[kk+1+ldd*(jj+1+2)] -= pv0[kk+1]*pW[1+ldw*0] + pv1[kk+1]*pW[1+ldw*1];
+ }
+ for(; kk<kmax; kk++)
+ {
+ pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+ pC00[kk+ldd*(jj+1+2)] -= pv0[kk]*pW[1+ldw*0] + pv1[kk]*pW[1+ldw*1];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ // compute W = T * V^T * C
+ pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+ pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
+ for(kk=2; kk<kmax; kk++)
+ {
+ tmp = pC00[kk+ldd*(jj+0+2)];
+ pW[0+ldw*0] += tmp * pv0[kk];
+ pW[0+ldw*1] += tmp * pv1[kk];
+ }
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+ pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+ }
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ tmp = pC00[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ for(jj=1; jj<m-ii; jj++)
+ pC00[jj+ldd*0] *= tmp;
+ pC00[0+ldd*0] = beta;
+ }
+ if(ii<n)
+ {
+ // gemv_t & ger
+ pC01 = &pC00[0+ldd*1];
+ pv0 = &pC00[0+ldd*0];
+ jmax = n-ii-1;
+ kmax = m-ii;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ w0 = pC01[0+ldd*(jj+0)]; // pv0[0] = 1.0
+ w1 = pC01[0+ldd*(jj+1)]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*(jj+0)] * pv0[kk];
+ w1 += pC01[kk+ldd*(jj+1)] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ w1 = - dD[ii] * w1;
+ pC01[0+ldd*(jj+0)] += w0; // pv0[0] = 1.0
+ pC01[0+ldd*(jj+1)] += w1; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*(jj+0)] += w0 * pv0[kk];
+ pC01[kk+ldd*(jj+1)] += w1 * pv0[kk];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ldd*jj]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*jj] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ldd*jj] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*jj] += w0 * pv0[kk];
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ return 0;
+ }
+
+
+
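+// Householder LQ factorization (row-wise analogue of the QR factorization above): the Householder vectors are stored to the right of the diagonal of D and the tau scaling factors in dD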
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of LQ
+ REAL *dD = sD->dA+di; // vectors of tau
+ REAL alpha, beta, tmp, w0, w1;
+ REAL *pC00, *pC10, *pC11, *pv0, *pv1;
+ REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldw = 2;
+ REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldb = 2;
+ int imax, jmax, kmax;
+ // copy if needed
+ if(pA!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pA[ii+lda*jj];
+ }
+ }
+ }
+ imax = m<n ? m : n;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+ // first column
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau0
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pC00[0+ldd*0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ pC00[0+ldd*jj] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ pC10 = &pC00[1+ldd*0];
+ pv0 = &pC00[0+ldd*0];
+ kmax = n-ii;
+ w0 = pC10[0+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[0+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC10[0+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ldd*kk] += w0 * pv0[0+ldd*kk];
+ }
+ // second row
+ pC11 = &pD[(ii+1)+ldd*(ii+1)];
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau1
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau1
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v1
+ pC11[0+ldd*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ldd*jj] *= tmp;
+ }
+ // compute lower triangular T containing tau for matrix update
+ pv0 = &pC00[0+ldd*0];
+ pv1 = &pC00[1+ldd*0];
+ kmax = n-ii;
+ tmp = pv0[0+ldd*1];
+ for(kk=2; kk<kmax; kk++)
+ tmp += pv0[0+ldd*kk]*pv1[0+ldd*kk];
+ pT[0+ldb*0] = dD[ii+0];
+ pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldb*1] = dD[ii+1];
+ // downgrade
+ jmax = m-ii-2;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ // compute W^T = C^T * V
+ pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+ pW[1+ldw*0] = pC00[jj+1+2+ldd*0] + pC00[jj+1+2+ldd*1] * pv0[0+ldd*1];
+ pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
+ pW[1+ldw*1] = pC00[jj+1+2+ldd*1];
+ kk = 2;
+ for(; kk<kmax; kk++)
+ {
+ tmp = pC00[jj+0+2+ldd*kk];
+ pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+ tmp = pC00[jj+1+2+ldd*kk];
+ pW[1+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[1+ldw*1] += tmp * pv1[0+ldd*kk];
+ }
+ // compute W^T *= T
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+ pC00[jj+1+2+ldd*0] -= pW[1+ldw*0];
+ pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC00[jj+1+2+ldd*1] -= pv0[0+ldd*1]*pW[1+ldw*0] + pW[1+ldw*1];
+ kk = 2;
+ for(; kk<kmax-1; kk+=2)
+ {
+ pC00[jj+0+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[0+ldw*0] + pv1[0+ldd*(kk+0)]*pW[0+ldw*1];
+ pC00[jj+0+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[0+ldw*0] + pv1[0+ldd*(kk+1)]*pW[0+ldw*1];
+ pC00[jj+1+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[1+ldw*0] + pv1[0+ldd*(kk+0)]*pW[1+ldw*1];
+ pC00[jj+1+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[1+ldw*0] + pv1[0+ldd*(kk+1)]*pW[1+ldw*1];
+ }
+ for(; kk<kmax; kk++)
+ {
+ pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+ pC00[jj+1+2+ldd*kk] -= pv0[0+ldd*kk]*pW[1+ldw*0] + pv1[0+ldd*kk]*pW[1+ldw*1];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ // compute W = T * V^T * C
+ pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+ pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ tmp = pC00[jj+0+2+ldd*kk];
+ pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+ }
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+ pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+ }
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ldd*jj] *= tmp;
+ pC00[0+ldd*0] = beta;
+ }
+ if(ii<n)
+ {
+ // gemv_t & ger
+ pC10 = &pC00[1+ldd*0];
+ pv0 = &pC00[0+ldd*0];
+ jmax = m-ii-1;
+ kmax = n-ii;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ w0 = pC10[jj+0+ldd*0]; // pv0[0] = 1.0
+ w1 = pC10[jj+1+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[jj+0+ldd*kk] * pv0[0+ldd*kk];
+ w1 += pC10[jj+1+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ w1 = - dD[ii] * w1;
+ pC10[jj+0+ldd*0] += w0; // pv0[0] = 1.0
+ pC10[jj+1+ldd*0] += w1; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[jj+0+ldd*kk] += w0 * pv0[0+ldd*kk];
+ pC10[jj+1+ldd*kk] += w1 * pv0[0+ldd*kk];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ w0 = pC10[jj+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[jj+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC10[jj+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[jj+ldd*kk] += w0 * pv0[0+ldd*kk];
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long info;
+ long long tmp;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &mm, pD, &ldd, &info);
+#else
+ int i1 = 1;
+ int info;
+ int tmp;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &m, pD, &ldd, &info);
+#endif
+ return;
+ }
+
+
+
+// dpotrf of an m x n lower panel (potrf of the top n x n block, trsm of the bottom m-n rows)
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long mmn = mm-nn;
+ long long info;
+ long long tmp;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int info;
+ int tmp;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dsyrk dpotrf
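+// computes the first n columns (m x n lower panel) of the lower Cholesky factor of C + A * B^T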
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL d1 = 1.0;
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long mmn = mm-nn;
+ long long info;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &nn, &kk, &d1, pA, &lda, &d1, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &nn, &kk, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &nn, &kk, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+ }
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int info;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &n, &k, &d1, pA, &lda, &d1, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &n, &k, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &n, &k, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+ }
+#endif
+ return;
+ }
+
+
+
+// dgetrf without pivoting
+#if defined(REF_BLAS_BLIS)
+static void GETF2_NOPIVOT(long long m, long long n, REAL *A, long long lda)
+ {
+ if(m<=0 | n<=0)
+ return;
+ long long i, j;
+ long long jmax = m<n ? m : n;
+ REAL dtmp;
+ REAL dm1 = -1.0;
+ long long itmp0, itmp1;
+ long long i1 = 1;
+ for(j=0; j<jmax; j++)
+ {
+ itmp0 = m-j-1;
+ dtmp = 1.0/A[j+lda*j];
+ SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+ itmp1 = n-j-1;
+ GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+ }
+ return;
+ }
+#else
+static void GETF2_NOPIVOT(int m, int n, REAL *A, int lda)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int i, j;
+ int jmax = m<n ? m : n;
+ REAL dtmp;
+ REAL dm1 = -1.0;
+ int itmp0, itmp1;
+ int i1 = 1;
+ for(j=0; j<jmax; j++)
+ {
+ itmp0 = m-j-1;
+ dtmp = 1.0/A[j+lda*j];
+ SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+ itmp1 = n-j-1;
+ GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+ }
+ return;
+ }
+#endif
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ // TODO with custom level 2 LAPACK + level 3 BLAS
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETF2_NOPIVOT(mm, nn, pD, ldd);
+#else
+ int i1 = 1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETF2_NOPIVOT(m, n, pD, ldd);
+#endif
+ return;
+ }
+
+
+
+// dgetrf with row pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+ {
+ // TODO with custom level 2 LAPACK + level 3 BLAS
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ int tmp = m<n ? m : n;
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETRF(&mm, &nn, pD, &ldd, (long long *) ipiv, &info);
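+	// LAPACK getrf returns 1-based (Fortran) pivot indices: shift them to 0-based indexing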
+ for(jj=0; jj<tmp; jj++)
+ ipiv[jj] -= 1;
+#else
+ int i1 = 1;
+ int info;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETRF(&m, &n, pD, &ldd, ipiv, &info);
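+	// LAPACK getrf returns 1-based (Fortran) pivot indices: shift them to 0-based indexing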
+ for(jj=0; jj<tmp; jj++)
+ ipiv[jj] -= 1;
+#endif
+ return;
+ }
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ REAL dwork;
+ REAL *pD, *dD;
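+	// LAPACK workspace query: calling geqrf with lwork=-1 only returns the optimal workspace size in dwork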
+#if defined(REF_BLAS_BLIS)
+ long long mm = m;
+ long long nn = n;
+ long long lwork = -1;
+ long long info;
+ long long ldd = mm;
+ GEQRF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+ int lwork = -1;
+ int info;
+ int ldd = m;
+ GEQRF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+ int size = dwork;
+ return size*sizeof(REAL);
+ }
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+ REAL *dD = sD->dA+di;
+ REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info = -1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+ long long lwork = -1;
+ GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+ int i1 = 1;
+ int info = -1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+ int lwork = -1;
+ GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+ return;
+ }
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ REAL dwork;
+ REAL *pD, *dD;
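+	// LAPACK workspace query: calling gelqf with lwork=-1 only returns the optimal workspace size in dwork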
+#if defined(REF_BLAS_BLIS)
+ long long mm = m;
+ long long nn = n;
+ long long lwork = -1;
+ long long info;
+ long long ldd = mm;
+ GELQF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+ int lwork = -1;
+ int info;
+ int ldd = m;
+ GELQF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+ int size = dwork;
+ return size*sizeof(REAL);
+ }
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+ REAL *dD = sD->dA+di;
+ REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info = -1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+ long long lwork = -1;
+ GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+ int i1 = 1;
+ int info = -1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+ int lwork = -1;
+ GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blasfeo_target.h.in b/blasfeo_target.h.in
new file mode 100644
index 0000000..a98ac81
--- /dev/null
+++ b/blasfeo_target.h.in
@@ -0,0 +1,11 @@
+#ifndef TARGET_@TARGET@
+#define TARGET_@TARGET@
+#endif
+
+#ifndef LA_@LA@
+#define LA_@LA@
+#endif
+
+#ifndef EXT_DEP
+#cmakedefine EXT_DEP @EXT_DEP@
+#endif
diff --git a/doc/guide.pdf b/doc/guide.pdf
new file mode 100644
index 0000000..9f81df3
--- /dev/null
+++ b/doc/guide.pdf
Binary files differ
diff --git a/doc/guide.tex b/doc/guide.tex
new file mode 100644
index 0000000..626eaa4
--- /dev/null
+++ b/doc/guide.tex
@@ -0,0 +1,149 @@
+\documentclass[a4paper]{report}
+
+\usepackage[margin=3.0cm]{geometry}
+\usepackage{amsmath}
+\usepackage[pdftex]{graphicx}
+%\usepackage{graphics}
+\usepackage{subfig}
+
+
+
+\title{BLASFEO reference guide}
+\author{Gianluca Frison}
+
+
+
+\begin{document}
+
+\maketitle
+\tableofcontents
+
+
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Introduction}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+BLASFEO - BLAS For Embedded Optimization.
+
+
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Matrix data type}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+The fundamental data type in BLASFEO is a C struct defining a matrix, called {\tt strmat}.
+Depending on the linear algebra (LA) option chosen at build time, the struct is defined differently.
+
+
+
+\section{{\tt strmat} definition}
+
+
+\subsection{BLASFEO}
+
+\begin{verbatim}
+struct d_strmat
+ {
+ int bs;
+ int m;
+ int n;
+ int pm;
+ int cn;
+ double *pA;
+ double *dA;
+ int use_dA;
+ int memory_size;
+ };
+\end{verbatim}
+where the struct members are
+\begin{description}
+\item[bs] height of the panel
+\item[m] number of rows
+\item[n] number of columns
+\item[pm] number of rows of the matrix as allocated in memory, used for memory alignment
+\item[cn] number of columns of the matrix as allocated in memory, used for memory alignment
+\item[pA] pointer to a pm$\times$cn array of doubles; the first element is aligned to the cache line size
+\item[dA] pointer to an array of min(m,n) doubles, used e.g. to store the inverse of the diagonal of the matrix
+\item[use\_dA] flag telling whether dA contains meaningful information
+\item[memory\_size] size of the memory (in bytes) needed for pA and dA
+\end{description}
+
+
+\subsection{BLAS}
+
+\begin{verbatim}
+struct d_strmat
+ {
+ int m; // rows
+ int n; // cols
+ double *pA; // pointer to a m*n array of doubles
+ int memory_size; // size of needed memory
+ };
+\end{verbatim}
+\begin{description}
+\item[m] number of rows
+\item[n] number of columns
+\item[pA] pointer to an m$\times$n array of doubles
+\item[memory\_size] size of the memory (in bytes) needed for pA
+\end{description}
+
+
+
+\section{{\tt strmat} management}
+
+\begin{verbatim}
+void d_allocate_strmat(int m, int n, struct d_strmat *sA);
+\end{verbatim}
+
+\begin{verbatim}
+void d_free_strmat(struct d_strmat *sA);
+\end{verbatim}
+
+\begin{verbatim}
+int d_size_strmat(int m, int n);
+\end{verbatim}
+
+\begin{verbatim}
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory);
+\end{verbatim}
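+
+As a minimal usage sketch (following the calls used in the {\tt examples} folder, and assuming {\tt m} and {\tt n} hold the matrix dimensions), a {\tt strmat} can either be allocated internally or created on top of user-provided memory:
+\begin{verbatim}
+struct d_strmat sA;
+
+// option 1: let BLASFEO allocate the memory
+d_allocate_strmat(m, n, &sA);
+// ... use sA ...
+d_free_strmat(&sA);
+
+// option 2: create the strmat on top of user-provided memory
+void *mem;
+v_zeros_align(&mem, d_size_strmat(m, n));
+d_create_strmat(m, n, &sA, mem);
+// ... use sA ...
+v_free_align(mem);
+\end{verbatim}
+Here {\tt v\_zeros\_align} and {\tt v\_free\_align} are the auxiliary allocation routines with external dependencies used in the examples; any sufficiently large, properly aligned buffer of {\tt d\_size\_strmat(m, n)} bytes can be used instead.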
+
+
+
+\section{{\tt strmat} conversion}
+
+\begin{verbatim}
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA,
+ int ai, int aj);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA,
+ int ai, int aj);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj,
+ double *A, int lda);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj,
+ double *A, int lda);
+\end{verbatim}
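+
+For instance (a minimal sketch, assuming {\tt m} and {\tt n} hold the matrix dimensions and using {\tt d\_zeros} to allocate the column-major matrix, as in the examples), a column-major matrix can be copied into a {\tt strmat} and back as follows:
+\begin{verbatim}
+double *A; d_zeros(&A, m, n);            // column-major m x n matrix, lda = m
+
+struct d_strmat sA;
+d_allocate_strmat(m, n, &sA);
+d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0); // copy A into sA starting at (0,0)
+// ... operate on sA ...
+d_cvt_strmat2mat(m, n, &sA, 0, 0, A, m); // copy sA back into A
+
+d_free_strmat(&sA);
+d_free(A);
+\end{verbatim}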
+
+
+
+\section{{\tt strmat} print}
+
+\begin{verbatim}
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+\end{verbatim}
+
+
+
+\end{document}
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..7204cba
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,69 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = -lblis -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+
+ifneq ($(NUM_THREAD), 1)
+LIBS += -pthread
+endif
+
+#OBJS_TEST = example_d_lu_factorization.o
+#OBJS_TEST = example_s_lu_factorization.o
+OBJS_TEST = tools.o example_d_riccati_recursion.o
+#OBJS_TEST = tools.o example_s_riccati_recursion.o
+
+all: clean obj run
+
+obj: $(OBJS_TEST)
+ cp ../libblasfeo.a .
+ $(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+run:
+ ./test.out
+
+clean:
+ rm -f *.o
+ rm -f test.out
+ rm -f libblasfeo.a
+
diff --git a/examples/example_d_lu_factorization.c b/examples/example_d_lu_factorization.c
new file mode 100644
index 0000000..62b3413
--- /dev/null
+++ b/examples/example_d_lu_factorization.c
@@ -0,0 +1,210 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// d_print_mat(n, n, A, n);
+
+ // spd matrix
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ // identity
+ double *I; d_zeros(&I, n, n);
+ for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ // result matrix
+ double *D; d_zeros(&D, n, n);
+// d_print_mat(n, n, D, n);
+
+	// permutation indices
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+
+	// work space large enough for 5 matrix structs of size n times n
+ int size_strmat = 5*d_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
+
+ struct d_strmat sA;
+// d_allocate_strmat(n, n, &sA);
+ d_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ // convert from column major matrix to strmat
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ printf("\nA = \n");
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+// d_allocate_strmat(n, n, &sB);
+ d_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ // convert from column major matrix to strmat
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ printf("\nB = \n");
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sI;
+// d_allocate_strmat(n, n, &sI);
+ d_create_strmat(n, n, &sI, ptr_memory_strmat);
+ ptr_memory_strmat += sI.memory_size;
+ // convert from column major matrix to strmat
+
+ struct d_strmat sD;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+
+ struct d_strmat sLU;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sLU, ptr_memory_strmat);
+ ptr_memory_strmat += sLU.memory_size;
+
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nB+A*A' = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+// dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+ dgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+ printf("\nLU = \n");
+ d_print_strmat(n, n, &sLU, 0, 0);
+ printf("\nipiv = \n");
+ int_print_mat(1, n, ipiv, 1);
+
+#if 0 // solve P L U X = P B
+ d_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ drowpe_libstr(n, ipiv, &sI);
+ printf("\nperm(I) = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ dtrsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L)) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+ dtrsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ d_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+ d_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI' = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ dcolpe_libstr(n, ipiv, &sB);
+ printf("\nperm(I') = \n");
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L')) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+ dtrsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A') = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ d_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
+
+ // print matrix in column-major format
+ printf("\ninv(A) = \n");
+ d_print_mat(n, n, D, n);
+
+
+
+ //
+ // free memory
+ //
+
+ d_free(A);
+ d_free(B);
+ d_free(D);
+ d_free(I);
+ int_free(ipiv);
+// d_free_strmat(&sA);
+// d_free_strmat(&sB);
+// d_free_strmat(&sD);
+// d_free_strmat(&sI);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
diff --git a/examples/example_d_riccati_recursion.c b/examples/example_d_riccati_recursion.c
new file mode 100644
index 0000000..1618ce9
--- /dev/null
+++ b/examples/example_d_riccati_recursion.c
@@ -0,0 +1,595 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
+static void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+ dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+static void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ dpotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ return;
+
+ }
+
+
+
+static void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected to each other with springs (in a row), and the first and the last mass connected to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+	printf("\nExample of Riccati recursion\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 4;
+ int nu_ = 1;
+
+	// stage-wise problem sizes (possibly varying per stage)
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+ double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+ double *b; d_zeros(&b, nx_, 1); // states offset
+ double *x0; d_zeros(&x0, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ d_mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ d_print_mat(nx_, nx_, A, nx_);
+ d_print_mat(nx_, nu_, B, nx_);
+ d_print_mat(1, nx_, b, 1);
+ d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ double *R; d_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ double *S; d_zeros(&S, nu_, nx_);
+
+ double *Q; d_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ double *r; d_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ double *q; d_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ d_print_mat(nu_, nu_, R, nu_);
+ d_print_mat(nu_, nx_, S, nu_);
+ d_print_mat(nx_, nx_, Q, nx_);
+ d_print_mat(1, nu_, r, 1);
+ d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct d_strmat sA;
+ d_allocate_strmat(nx_, nx_, &sA);
+ d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct d_strvec sb;
+ d_allocate_strvec(nx_, &sb);
+ d_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct d_strvec sx0;
+ d_allocate_strvec(nx_, &sx0);
+ d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct d_strvec sb0;
+ d_allocate_strvec(nx_, &sb0);
+ double *b0; d_zeros(&b0, nx_, 1); // states offset
+ dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ d_print_tran_strvec(nx_, &sb0, 0);
+
+ struct d_strmat sBbt0;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ d_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct d_strmat sBAbt1;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct d_strvec sr0; // XXX no need to update r0 since S=0
+ d_allocate_strvec(nu_, &sr0);
+ d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct d_strmat sRr0;
+ d_allocate_strmat(nu_+1, nu_, &sRr0);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct d_strvec srq1;
+ d_allocate_strvec(nu_+nx_, &srq1);
+ d_cvt_vec2strvec(nu_, r, &srq1, 0);
+ d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct d_strmat sRSQrq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct d_strvec sqN;
+ d_allocate_strvec(nx_, &sqN);
+ d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct d_strmat sQqN;
+ d_allocate_strmat(nx_+1, nx_, &sQqN);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct d_strmat hsBAbt[N];
+ struct d_strvec hsb[N];
+ struct d_strmat hsRSQrq[N+1];
+ struct d_strvec hsrq[N+1];
+ struct d_strmat hsL[N+1];
+ struct d_strvec hsPb[N];
+ struct d_strvec hsux[N+1];
+ struct d_strvec hspi[N];
+ struct d_strmat hswork_mat[1];
+ struct d_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+ d_allocate_strvec(nx_, &hsPb[0]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ d_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ d_allocate_strvec(nx_, &hsPb[ii]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ d_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ d_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+// printf("\nL = \n\n");
+// for(ii=0; ii<=N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(A);
+ d_free(B);
+ d_free(b);
+ d_free(x0);
+ d_free(R);
+ d_free(S);
+ d_free(Q);
+ d_free(r);
+ d_free(q);
+ d_free(b0);
+ d_free_strmat(&sA);
+ d_free_strvec(&sb);
+ d_free_strmat(&sBbt0);
+ d_free_strvec(&sb0);
+ d_free_strmat(&sBAbt1);
+ d_free_strmat(&sRr0);
+ d_free_strvec(&sr0);
+ d_free_strmat(&sRSQrq1);
+ d_free_strvec(&srq1);
+ d_free_strmat(&sQqN);
+ d_free_strvec(&sqN);
+ d_free_strmat(&hsL[0]);
+ d_free_strvec(&hsPb[0]);
+ d_free_strvec(&hsux[0]);
+ d_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ d_free_strmat(&hsL[ii]);
+ d_free_strvec(&hsPb[ii]);
+ d_free_strvec(&hsux[ii]);
+ d_free_strvec(&hspi[ii]);
+ }
+ d_free_strmat(&hsL[N]);
+ d_free_strvec(&hsux[N]);
+ d_free_strmat(&hswork_mat[0]);
+ d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
diff --git a/examples/example_s_lu_factorization.c b/examples/example_s_lu_factorization.c
new file mode 100644
index 0000000..e298604
--- /dev/null
+++ b/examples/example_s_lu_factorization.c
@@ -0,0 +1,211 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+
+ float *A; s_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// s_print_mat(n, n, A, n);
+
+ // spd matrix
+ float *B; s_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// s_print_mat(n, n, B, n);
+
+ // identity
+ float *I; s_zeros(&I, n, n);
+ for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+//	s_print_mat(n, n, I, n);
+
+ // result matrix
+ float *D; s_zeros(&D, n, n);
+// s_print_mat(n, n, D, n);
+
+	// permutation indices
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+
+	// workspace large enough for 5 matrix structs of size n times n
+ int size_strmat = 5*s_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
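+	// sub-allocation pattern: s_size_strmat(n, n) returns the bytes needed by one n-by-n
+	// strmat, so a single aligned block can back several structs; after every
+	// s_create_strmat below, ptr_memory_strmat is advanced by the memory_size of the
+	// struct just created so that the next one gets the following chunk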
+
+ struct s_strmat sA;
+// s_allocate_strmat(n, n, &sA);
+ s_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ // convert from column major matrix to strmat
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ printf("\nA = \n");
+ s_print_strmat(n, n, &sA, 0, 0);
+
+ struct s_strmat sB;
+// s_allocate_strmat(n, n, &sB);
+ s_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ // convert from column major matrix to strmat
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ printf("\nB = \n");
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ struct s_strmat sI;
+// s_allocate_strmat(n, n, &sI);
+ s_create_strmat(n, n, &sI, ptr_memory_strmat);
+ ptr_memory_strmat += sI.memory_size;
+	// (the identity matrix I is converted into sI later, in the solve section below)
+
+ struct s_strmat sD;
+// s_allocate_strmat(n, n, &sD);
+ s_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+
+ struct s_strmat sLU;
+// s_allocate_strmat(n, n, &sD);
+ s_create_strmat(n, n, &sLU, ptr_memory_strmat);
+ ptr_memory_strmat += sLU.memory_size;
+
+ sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nB+A*A' = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+// sgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+ sgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+ printf("\nLU = \n");
+ s_print_strmat(n, n, &sLU, 0, 0);
+ printf("\nipiv = \n");
+ int_print_mat(1, n, ipiv, 1);
+
+#if 0 // solve P L U X = P B
+ s_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ srowpe_libstr(n, ipiv, &sI);
+ printf("\nperm(I) = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ strsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L)) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+ strsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ s_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+ s_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI' = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ scolpe_libstr(n, ipiv, &sB);
+ printf("\nperm(I') = \n");
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ strsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L')) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+ strsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A') = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ s_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
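+	// in outline, the two branches above compute the same result (the inverse of the
+	// factorized matrix D): the disabled branch solves P L U X = P B with left triangular
+	// solves on a row-permuted identity, while the active branch solves the transposed
+	// system X' (P L U)' = B' P' with right triangular solves and converts the result
+	// back through a transposed copy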
+
+ // print matrix in column-major format
+ printf("\ninv(A) = \n");
+ s_print_mat(n, n, D, n);
+
+
+
+ //
+ // free memory
+ //
+
+ s_free(A);
+ s_free(B);
+ s_free(D);
+ s_free(I);
+ int_free(ipiv);
+// s_free_strmat(&sA);
+// s_free_strmat(&sB);
+// s_free_strmat(&sD);
+// s_free_strmat(&sI);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
+
diff --git a/examples/example_s_riccati_recursion.c b/examples/example_s_riccati_recursion.c
new file mode 100644
index 0000000..03b9fc6
--- /dev/null
+++ b/examples/example_s_riccati_recursion.c
@@ -0,0 +1,605 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+
+static void s_back_ric_sv_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strmat *hswork_mat, struct s_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ spotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
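+		// backward Riccati step, in outline: form W = [B A b]' * Lxx of the next stage
+		// (strmm), append the gradient row of the next-stage factor (sgead), then build
+		// and factorize RSQrq + W*W' either in one fused call (ssyrk_spotrf) or with the
+		// separate syrk + potrf fallback below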
+ strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+ sgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ srowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ srowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+static void s_back_ric_trf_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ spotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+ ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ return;
+
+ }
+
+
+
+static void s_back_ric_trs_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strvec *hsb, struct s_strvec *hsrq, struct s_strmat *hsL, struct s_strvec *hsPb, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ sveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ sveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ sveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected to each other by springs (in a row), with the first and the last mass connected to walls. nu (<=nx) control inputs act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
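+// In outline, the continuous-time model built below is
+//     Ac = [ 0  I ]        Bc = [ 0 ]
+//          [ T  0 ]             [ I ]   (identity acting on the first nu velocities),
+// with T the tridiagonal spring-coupling matrix, and it is discretized exactly as
+//     A = expm(Ts*Ac),   B = Ac^{-1} (A - I) Bc,   b = 0.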
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+	printf("\nExample of backward Riccati recursion\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 4;
+ int nu_ = 1;
+
+ // stage-wise variant size
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *Ad; d_zeros(&Ad, nx_, nx_); // states update matrix
+
+ double *Bd; d_zeros(&Bd, nx_, nu_); // inputs matrix
+
+ double *bd; d_zeros(&bd, nx_, 1); // states offset
+ double *x0d; d_zeros(&x0d, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ d_mass_spring_system(Ts, nx_, nu_, N, Ad, Bd, bd, x0d);
+
+ float *A; s_zeros(&A, nx_, nx_); for(ii=0; ii<nx_*nx_; ii++) A[ii] = (float) Ad[ii];
+ float *B; s_zeros(&B, nx_, nu_); for(ii=0; ii<nx_*nu_; ii++) B[ii] = (float) Bd[ii];
+ float *b; s_zeros(&b, nx_, 1); for(ii=0; ii<nx_; ii++) b[ii] = (float) bd[ii];
+ float *x0; s_zeros(&x0, nx_, 1); for(ii=0; ii<nx_; ii++) x0[ii] = (float) x0d[ii];
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ s_print_mat(nx_, nx_, A, nx_);
+ s_print_mat(nx_, nu_, B, nx_);
+ s_print_mat(1, nx_, b, 1);
+ s_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ float *R; s_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ float *S; s_zeros(&S, nu_, nx_);
+
+ float *Q; s_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ float *r; s_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ float *q; s_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ s_print_mat(nu_, nu_, R, nu_);
+ s_print_mat(nu_, nx_, S, nu_);
+ s_print_mat(nx_, nx_, Q, nx_);
+ s_print_mat(1, nu_, r, 1);
+ s_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct s_strmat sA;
+ s_allocate_strmat(nx_, nx_, &sA);
+ s_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct s_strvec sb;
+ s_allocate_strvec(nx_, &sb);
+ s_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct s_strvec sx0;
+ s_allocate_strvec(nx_, &sx0);
+ s_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct s_strvec sb0;
+ s_allocate_strvec(nx_, &sb0);
+	float *b0; s_zeros(&b0, nx_, 1); // states offset
+ sgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ s_print_tran_strvec(nx_, &sb0, 0);
+
+ struct s_strmat sBbt0;
+ s_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ s_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ srowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ s_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct s_strmat sBAbt1;
+ s_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ s_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ s_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ s_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ s_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct s_strvec sr0; // XXX no need to update r0 since S=0
+ s_allocate_strvec(nu_, &sr0);
+ s_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct s_strmat sRr0;
+ s_allocate_strmat(nu_+1, nu_, &sRr0);
+ s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ srowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ s_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct s_strvec srq1;
+ s_allocate_strvec(nu_+nx_, &srq1);
+ s_cvt_vec2strvec(nu_, r, &srq1, 0);
+ s_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct s_strmat sRSQrq1;
+ s_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ s_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ srowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ s_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct s_strvec sqN;
+ s_allocate_strvec(nx_, &sqN);
+ s_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct s_strmat sQqN;
+ s_allocate_strmat(nx_+1, nx_, &sQqN);
+ s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ srowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ s_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct s_strmat hsBAbt[N];
+ struct s_strvec hsb[N];
+ struct s_strmat hsRSQrq[N+1];
+ struct s_strvec hsrq[N+1];
+ struct s_strmat hsL[N+1];
+ struct s_strvec hsPb[N];
+ struct s_strvec hsux[N+1];
+ struct s_strvec hspi[N];
+ struct s_strmat hswork_mat[1];
+ struct s_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ s_allocate_strmat(nu_+1, nu_, &hsL[0]);
+ s_allocate_strvec(nx_, &hsPb[0]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ s_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ s_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ s_allocate_strvec(nx_, &hsPb[ii]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ s_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ s_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ s_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ s_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ s_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ s_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+// printf("\nL = \n\n");
+// for(ii=0; ii<=N; ii++)
+// s_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(Ad);
+ d_free(Bd);
+ d_free(bd);
+ d_free(x0d);
+ s_free(A);
+ s_free(B);
+ s_free(b);
+ s_free(x0);
+ s_free(R);
+ s_free(S);
+ s_free(Q);
+ s_free(r);
+ s_free(q);
+ s_free(b0);
+ s_free_strmat(&sA);
+ s_free_strvec(&sb);
+ s_free_strmat(&sBbt0);
+ s_free_strvec(&sb0);
+ s_free_strmat(&sBAbt1);
+ s_free_strmat(&sRr0);
+ s_free_strvec(&sr0);
+ s_free_strmat(&sRSQrq1);
+ s_free_strvec(&srq1);
+ s_free_strmat(&sQqN);
+ s_free_strvec(&sqN);
+ s_free_strmat(&hsL[0]);
+ s_free_strvec(&hsPb[0]);
+ s_free_strvec(&hsux[0]);
+ s_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ s_free_strmat(&hsL[ii]);
+ s_free_strvec(&hsPb[ii]);
+ s_free_strvec(&hsux[ii]);
+ s_free_strvec(&hspi[ii]);
+ }
+ s_free_strmat(&hsL[N]);
+ s_free_strvec(&hsux[N]);
+ s_free_strmat(&hswork_mat[0]);
+ s_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
+
diff --git a/examples/example_tree_riccati_recursion.c b/examples/example_tree_riccati_recursion.c
new file mode 100644
index 0000000..b61d2d3
--- /dev/null
+++ b/examples/example_tree_riccati_recursion.c
@@ -0,0 +1,638 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
+void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+ dtrtr_l_libstr(nx[N], &hsL[N], 0, 0, &hsLxt[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rutn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hsLxt[N-nn], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ dtrtr_l_libstr(nx[N-nn-1], &hsL[N-nn-1], nu[N-nn-1], nu[N-nn-1], &hsLxt[N-nn-1], 0, 0);
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
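+// trf step for a "funnel" node of a scenario tree: accumulate the Riccati updates coming
+// from the md child realizations of (BAbt, Lxt_old) into one factor hsL[0] (the first
+// child seeds the syrk with RSQrq, the remaining ones add onto hsL[0]), then factorize
+// and extract the new Lxt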
+void d_back_ric_trf_funnel1_libstr(int md, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt_old, struct d_strmat *hsLxt_new, struct d_strmat *hswork_mat)
+ {
+
+ int ii;
+
+ ii = 0;
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ for(ii=1; ii<md; ii++)
+ {
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsL[0], 0, 0, &hsL[0], 0, 0);
+ }
+
+ dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt_new[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_step1_libstr(int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+ {
+
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[0], 0, 0, &hsLxt[1], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_stepN_libstr(int *nx, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt)
+ {
+
+ dpotrf_l_libstr(nx[0], nx[0], &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], 0, 0, &hsLxt[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ d_back_ric_trf_stepN_libstr(&nx[N], &hsRSQrq[N], &hsL[N], &hsLxt[N]);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ d_back_ric_trf_step1_libstr(&nx[N-nn-1], &nu[N-nn-1], &hsBAbt[N-nn-1], &hsRSQrq[N-nn-1], &hsL[N-nn-1], &hsLxt[N-nn-1], hswork_mat);
+ }
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected to each other by springs (in a row), with the first and the last mass connected to walls. nu (<=nx) control inputs act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+void mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+	printf("\nExample of backward Riccati recursion\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 8;
+ int nu_ = 3;
+
+ // stage-wise variant size
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+ double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+ double *b; d_zeros(&b, nx_, 1); // states offset
+ double *x0; d_zeros_align(&x0, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ d_print_mat(nx_, nx_, A, nx_);
+ d_print_mat(nx_, nu_, B, nx_);
+ d_print_mat(1, nx_, b, 1);
+ d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ double *R; d_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ double *S; d_zeros(&S, nu_, nx_);
+
+ double *Q; d_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ double *r; d_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ double *q; d_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ d_print_mat(nu_, nu_, R, nu_);
+ d_print_mat(nu_, nx_, S, nu_);
+ d_print_mat(nx_, nx_, Q, nx_);
+ d_print_mat(1, nu_, r, 1);
+ d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct d_strmat sA;
+ d_allocate_strmat(nx_, nx_, &sA);
+ d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct d_strvec sb;
+ d_allocate_strvec(nx_, &sb);
+ d_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct d_strvec sx0;
+ d_allocate_strvec(nx_, &sx0);
+ d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct d_strvec sb0;
+ d_allocate_strvec(nx_, &sb0);
+ double *b0; d_zeros(&b0, nx_, 1); // states offset
+ dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ d_print_tran_strvec(nx_, &sb0, 0);
+
+ struct d_strmat sBbt0;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ d_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct d_strmat sBAbt1;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct d_strvec sr0; // XXX no need to update r0 since S=0
+ d_allocate_strvec(nu_, &sr0);
+ d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct d_strmat sRr0;
+ d_allocate_strmat(nu_+1, nu_, &sRr0);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct d_strvec srq1;
+ d_allocate_strvec(nu_+nx_, &srq1);
+ d_cvt_vec2strvec(nu_, r, &srq1, 0);
+ d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct d_strmat sRSQrq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct d_strvec sqN;
+ d_allocate_strvec(nx_, &sqN);
+ d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct d_strmat sQqN;
+ d_allocate_strmat(nx_+1, nx_, &sQqN);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct d_strmat hsBAbt[N];
+ struct d_strvec hsb[N];
+ struct d_strmat hsRSQrq[N+1];
+ struct d_strvec hsrq[N+1];
+ struct d_strmat hsL[N+1];
+ struct d_strmat hsLxt[N+1];
+ struct d_strvec hsPb[N];
+ struct d_strvec hsux[N+1];
+ struct d_strvec hspi[N];
+ struct d_strmat hswork_mat[1];
+ struct d_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+// d_allocate_strmat(nu_+1, nu_, &hsLxt[0]);
+ d_allocate_strvec(nx_, &hsPb[0]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ d_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ d_allocate_strmat(nx_, nu_+nx_, &hsLxt[ii]);
+ d_allocate_strvec(nx_, &hsPb[ii]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ d_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ d_allocate_strmat(nx_, nx_, &hsLxt[N]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ d_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsLxt, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(A);
+ d_free(B);
+ d_free(b);
+ d_free_align(x0);
+ d_free(R);
+ d_free(S);
+ d_free(Q);
+ d_free(r);
+ d_free(q);
+ d_free(b0);
+ d_free_strmat(&sA);
+ d_free_strvec(&sb);
+ d_free_strmat(&sBbt0);
+ d_free_strvec(&sb0);
+ d_free_strmat(&sBAbt1);
+ d_free_strmat(&sRr0);
+ d_free_strvec(&sr0);
+ d_free_strmat(&sRSQrq1);
+ d_free_strvec(&srq1);
+ d_free_strmat(&sQqN);
+ d_free_strvec(&sqN);
+ d_free_strmat(&hsL[0]);
+// d_free_strmat(&hsLxt[0]);
+ d_free_strvec(&hsPb[0]);
+ d_free_strvec(&hsux[0]);
+ d_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ d_free_strmat(&hsL[ii]);
+ d_free_strmat(&hsLxt[ii]);
+ d_free_strvec(&hsPb[ii]);
+ d_free_strvec(&hsux[ii]);
+ d_free_strvec(&hspi[ii]);
+ }
+ d_free_strmat(&hsL[N]);
+ d_free_strmat(&hsLxt[N]);
+ d_free_strvec(&hsux[N]);
+ d_free_strmat(&hswork_mat[0]);
+ d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
diff --git a/examples/tools.c b/examples/tools.c
new file mode 100644
index 0000000..51d9e95
--- /dev/null
+++ b/examples/tools.c
@@ -0,0 +1,724 @@
+/**************************************************************************************************
+* *
+* This file is part of HPMPC. *
+* *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC. *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* *
+**************************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+//#include "../include/aux_d.h"
+
+//void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+//void dgesv_(int *n, int *nrhs, double *A, int *lda, int *ipiv, double *B, int *ldb, int *info);
+//void dcopy_(int *n, double *dx, int *incx, double *dy, int *incy);
+//void daxpy_(int *n, double *da, double *dx, int *incx, double *dy, int *incy);
+//void dscal_(int *n, double *da, double *dx, int *incx);
+
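+// manual declaration of posix_memalign(3); presumably kept so the file also builds in
+// strict language modes where <stdlib.h> does not expose the prototype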
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+
+
+/************************************************
+ matrix-matrix multiplication
+************************************************/
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc)
+ {
+
+ int ii, jj, kk;
+
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ C[ii+ldc*jj] = 0;
+ for(kk=0; kk<k; kk++)
+ {
+ C[ii+ldc*jj] += A[ii+lda*kk] * B[kk+ldb*jj];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+void daxpy_3l(int n, double da, double *dx, double *dy)
+ {
+ int i;
+ for(i=0; i<n; i++)
+ {
+ dy[i] += da*dx[i];
+ }
+ }
+
+
+
+void dscal_3l(int n, double da, double *dx)
+ {
+ int i;
+ for(i=0; i<n; i++)
+ {
+ dx[i] *= da;
+ }
+ }
+
+
+
+/************************************************
+ Routine that copies a matrix
+************************************************/
+void dmcopy(int row, int col, double *A, int lda, double *B, int ldb)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ B[i+j*ldb] = A[i+j*lda];
+ }
+ }
+ }
+
+
+
+int idamax_3l(int n, double *x)
+ {
+
+ if(n<=0)
+ return 0;
+ if(n==1)
+ return 0;
+
+ double dabs;
+ double dmax = (x[0]>0 ? x[0] : -x[0]);
+ int idmax = 0;
+ int jj;
+ for(jj=1; jj<n; jj++)
+ {
+ dabs = (x[jj]>0 ? x[jj] : -x[jj]);
+ if(dabs>dmax)
+ {
+ dmax = dabs;
+ idmax = jj;
+ }
+ }
+
+ return idmax;
+
+ }
+
+
+
+void dswap_3l(int n, double *x, int incx, double *y, int incy)
+ {
+
+ if(n<=0)
+ return;
+
+ double temp;
+ int jj;
+ for(jj=0; jj<n; jj++)
+ {
+ temp = x[0];
+ x[0] = y[0];
+ y[0] = temp;
+ x += incx;
+ y += incy;
+ }
+
+ }
+
+
+
+void dger_3l(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *A, int lda)
+ {
+
+ if(m==0 || n==0 || alpha==0.0)
+ return;
+
+ int i, j;
+ double *px, *py, temp;
+
+ py = y;
+ for(j=0; j<n; j++)
+ {
+ temp = alpha * py[0];
+ px = x;
+ for(i=0; i<m; i++)
+ {
+ A[i+lda*j] += px[0] * temp;
+ px += incx;
+ }
+ py += incy;
+ }
+
+ return;
+
+ }
+
+
+
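+// unblocked LU factorization with partial row pivoting (same computation pattern as
+// LAPACK's dgetf2); the constant 2.22e-16 below is the double-precision unit roundoff,
+// used to decide between scaling the column by 1/Ajj and dividing element by element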
+void dgetf2_3l(int m, int n, double *A, int lda, int *ipiv, int *info)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ int i, j, jp;
+
+ double Ajj;
+
+ int size_min = ( m<n ? m : n );
+
+ for(j=0; j<size_min; j++)
+ // find the pivot and test for singularity
+ {
+ jp = j + idamax_3l(m-j, &A[j+lda*j]);
+ ipiv[j] = jp;
+ if( A[jp+lda*j]!=0)
+ {
+ // apply the interchange to columns 0:n-1
+ if(jp!=j)
+ {
+ dswap_3l(n, &A[j], lda, &A[jp], lda);
+ }
+ // compute elements j+1:m-1 of j-th column
+ if(j<m-1)
+ {
+ Ajj = A[j+lda*j];
+ if( ( Ajj>0 ? Ajj : -Ajj ) >= 2.22e-16 )
+ {
+ dscal_3l(m-j-1, 1.0/Ajj, &A[j+1+lda*j]);
+ }
+ else
+ {
+ for(i=j+1; i<m; i++)
+ {
+ A[i+lda*j] /= Ajj;
+ }
+ }
+ }
+ }
+ else if(*info==0)
+ {
+ *info = j+1;
+ }
+
+ if( j < size_min )
+ {
+ // update trailing submatrix
+ dger_3l(m-j-1, n-j-1, -1.0, &A[j+1+lda*j], 1, &A[j+lda*(j+1)], lda, &A[j+1+lda*(j+1)], lda);
+ }
+
+ }
+
+ return;
+
+ }
+
+
+
+void dlaswp_3l(int n, double *A, int lda, int k1, int k2, int *ipiv)
+ {
+
+ int i, j, k, ix, ix0, i1, i2, n32, ip;
+ double temp;
+
+ ix0 = k1;
+ i1 = k1;
+ i2 = k2;
+
+ n32 = (n/32)*32;
+ if(n32!=0)
+ {
+ for(j=0; j<n32; j+=32)
+ {
+ ix = ix0;
+ for(i=i1; i<i2; i++)
+ {
+ ip = ipiv[ix];
+ if(ip!=i)
+ {
+ for(k=j; k<j+32; k++)
+ {
+ temp = A[i+lda*k];
+ A[i+lda*k] = A[ip+lda*k];
+ A[ip+lda*k] = temp;
+ }
+ }
+ ix++;
+ }
+ }
+ }
+ if(n32!=n)
+ {
+ ix = ix0;
+ for(i=i1; i<i2; i++)
+ {
+ ip = ipiv[ix];
+ if(ip!=i)
+ {
+ for(k=n32; k<n; k++)
+ {
+ temp = A[i+lda*k];
+ A[i+lda*k] = A[ip+lda*k];
+ A[ip+lda*k] = temp;
+ }
+ }
+ ix++;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// left lower no-transp unit
+void dtrsm_l_l_n_u_3l(int m, int n, double *A, int lda, double *B, int ldb)
+ {
+
+ if(m==0 || n==0)
+ return;
+
+ int i, j, k;
+
+ for(j=0; j<n; j++)
+ {
+ for(k=0; k<m; k++)
+ {
+ for(i=k+1; i<m; i++)
+ {
+ B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// left upper no-transp non-unit
+void dtrsm_l_u_n_n_3l(int m, int n, double *A, int lda, double *B, int ldb)
+ {
+
+ if(m==0 || n==0)
+ return;
+
+ int i, j, k;
+
+ for(j=0; j<n; j++)
+ {
+ for(k=m-1; k>=0; k--)
+ {
+ B[k+ldb*j] /= A[k+lda*k];
+ for(i=0; i<k; i++)
+ {
+ B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void dgetrs_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+ {
+
+ if(n==0 || nrhs==0)
+ return;
+
+ // solve A * X = B
+
+ // apply row interchanges to the rhs
+ dlaswp_3l(nrhs, B, ldb, 0, n, ipiv);
+
+ // solve L*X = B, overwriting B with X
+ dtrsm_l_l_n_u_3l(n, nrhs, A, lda, B, ldb);
+
+ // solve U*X = B, overwriting B with X
+ dtrsm_l_u_n_n_3l(n, nrhs, A, lda, B, ldb);
+
+ return;
+
+ }
+
+
+
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+ {
+
+ // compute the LU factorization of A
+ dgetf2_3l(n, n, A, lda, ipiv, info);
+
+ if(*info==0)
+ {
+ // solve the system A*X = B, overwriting B with X
+ dgetrs_3l(n, nrhs, A, lda, ipiv, B, ldb, info);
+ }
+
+ return;
+
+ }
+
+
+
+/* one norm of a matrix */
+double onenorm(int row, int col, double *ptrA)
+ {
+ double max, temp;
+ int i, j;
+ temp = 0;
+ for(j=0; j<col; j++)
+ {
+		temp = fabs(*(ptrA+j*row));
+		for(i=1; i<row; i++)
+			{
+			temp += fabs(*(ptrA+j*row+i));
+			}
+		if(j==0) max = temp;
+		else if(temp>max) max = temp;
+		}
+	return max;
+ }
+
+
+
+/* computes the Pade approximation of degree m of the matrix A */
+void padeapprox(int m, int row, double *A)
+ {
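+	// for each supported degree m, U collects the odd-power terms (premultiplied by A)
+	// and V the even-power terms of the degree-m Pade approximant, using the coefficient
+	// vectors c[]; the commented-out BLAS calls document the dgemm_/daxpy_/dscal_
+	// operations that the local *_3l helpers replace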
+ int ii;
+ int row2 = row*row;
+/* int i1 = 1;*/
+/* double d0 = 0;*/
+/* double d1 = 1;*/
+/* double dm1 = -1;*/
+
+ double *U = (double *) malloc(row*row*sizeof(double)); // d_zeros(&U, row, row);
+ double *V = (double *) malloc(row*row*sizeof(double)); // d_zeros(&V, row, row);
+
+ if(m==3)
+ {
+ double c[] = {120, 60, 12, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dscal_(&row2, &d0, temp, &i1);
+ dscal_3l(row2, 0, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &d0, V, &i1);
+ dscal_3l(row2, 0, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(temp);
+ }
+ else if(m==5)
+ {
+ double c[] = {30240, 15120, 3360, 420, 30, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+ dmcopy(row, row, A4, row, V, row);
+ dmcopy(row, row, A4, row, temp, row);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &c[4], V, &i1);
+ dscal_3l(row2, c[4], V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(temp);
+ }
+ else if(m==7)
+ {
+ double c[] = {17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A4, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A6, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+// dscal_(&row2, &d0, temp, &i1);
+ dscal_3l(row2, 0, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &d0, V, &i1);
+ dscal_3l(row2, 0, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(temp);
+ }
+ else if(m==9)
+ {
+ double c[] = {17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A4, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A6, row, row);
+		double *A8 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A8, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, A2, &row, &beta, A8, &row);
+ dgemm_nn_3l(row, row, row, A6, row, A2, row, A8, row);
+ dmcopy(row, row, A8, row, V, row);
+ dmcopy(row, row, A8, row, temp, row);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &c[8], V, &i1);
+ dscal_3l(row2, c[8], V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(A8);
+ free(temp);
+ }
+ else if(m==13) // tested
+ {
+ double c[] = {64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800, 129060195264000, 10559470521600, 670442572800, 33522128640, 1323241920, 40840800, 960960, 16380, 182, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A4, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A6, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+ dmcopy(row, row, A2, row, U, row);
+// dscal_(&row2, &c[9], U, &i1);
+ dscal_3l(row2, c[9], U);
+// daxpy_(&row2, &c[11], A4, &i1, U, &i1);
+ daxpy_3l(row2, c[11], A4, U);
+// daxpy_(&row2, &c[13], A6, &i1, U, &i1);
+ daxpy_3l(row2, c[13], A6, U);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, U, &row, &beta, temp, &row);
+ dgemm_nn_3l(row, row, row, A6, row, U, row, temp, row);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+ dmcopy(row, row, A2, row, temp, row);
+//		dscal_(&row2, &c[8], temp, &i1);
+		dscal_3l(row2, c[8], temp);
+// daxpy_(&row2, &c[12], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[12], A6, temp);
+// daxpy_(&row2, &c[10], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[10], A4, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, temp, &row, &beta, V, &row);
+ dgemm_nn_3l(row, row, row, A6, row, temp, row, V, row);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(temp);
+ }
+ else
+ {
+		printf("%s\n", "Wrong Pade approximation degree");
+ exit(1);
+ }
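+	// each branch above has built U (the odd-power part) and V (the even-power
+	// part) of the degree-m Pade approximant; below exp(A) ~= (V-U)^{-1} (U+V)
+	// is assembled and the linear system is solved with dgesv_3l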
+	double *D = (double *) malloc(row*row*sizeof(double)); // d_zeros(&D, row, row);
+// dcopy_(&row2, V, &i1, A, &i1);
+ dmcopy(row, row, V, row, A, row);
+// daxpy_(&row2, &d1, U, &i1, A, &i1);
+ daxpy_3l(row2, 1.0, U, A);
+// dcopy_(&row2, V, &i1, D, &i1);
+ dmcopy(row, row, V, row, D, row);
+// daxpy_(&row2, &dm1, U, &i1, D, &i1);
+ daxpy_3l(row2, -1.0, U, D);
+ int *ipiv = (int *) malloc(row*sizeof(int));
+ int info = 0;
+// dgesv_(&row, &row, D, &row, ipiv, A, &row, &info);
+ dgesv_3l(row, row, D, row, ipiv, A, row, &info);
+ free(ipiv);
+ free(D);
+ free(U);
+ free(V);
+ }
+
+
+
+void expm(int row, double *A)
+ {
+
+ int i;
+
+ int m_vals[] = {3, 5, 7, 9, 13};
+ double theta[] = {0.01495585217958292, 0.2539398330063230, 0.9504178996162932, 2.097847961257068, 5.371920351148152};
+ int lentheta = 5;
+
+ double normA = onenorm(row, row, A);
+
+ if(normA<=theta[4])
+ {
+ for(i=0; i<lentheta; i++)
+ {
+ if(normA<=theta[i])
+ {
+ padeapprox(m_vals[i], row, A);
+ break;
+ }
+ }
+ }
+ else
+ {
+ int s;
+ double t = frexp(normA/(theta[4]), &s);
+ s = s - (t==0.5);
+ t = pow(2,-s);
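+		// scaling and squaring: A is scaled by 2^(-s) so that its one norm is at
+		// most theta[4], the degree-13 Pade approximant is applied, and the
+		// result is squared s times in the loop below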
+ int row2 = row*row;
+/* int i1 = 1;*/
+// dscal_(&row2, &t, A, &i1);
+ dscal_3l(row2, t, A);
+ padeapprox(m_vals[4], row, A);
+		double *temp = (double *) malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+ for(i=0; i<s; i++)
+ {
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, temp, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, temp, row);
+ dmcopy(row, row, temp, row, A, row);
+ }
+ free(temp);
+ }
+ }
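+
+/* minimal usage sketch (hypothetical 2x2 example, column-major storage):
+ *   double a[4] = {0.0, 1.0, -1.0, 0.0}; // A = [0 -1; 1 0]
+ *   expm(2, a);                          // a now holds exp(A), the rotation by 1 radian
+ */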
+
+
diff --git a/examples/tools.h b/examples/tools.h
new file mode 100644
index 0000000..b017301
--- /dev/null
+++ b/examples/tools.h
@@ -0,0 +1,37 @@
+/**************************************************************************************************
+* *
+* This file is part of HPMPC. *
+* *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC. *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* *
+**************************************************************************************************/
+
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc);
+void daxpy_3l(int n, double da, double *dx, double *dy);
+void dscal_3l(int n, double da, double *dx);
+
+/* copies a matrix into another matrix */
+void dmcopy(int row, int col, double *ptrA, int lda, double *ptrB, int ldb);
+
+/* solution of a system of linear equations */
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info);
+
+/* matrix exponential */
+void expm(int row, double *A);
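+
+/* minimal usage sketch of the reference triple-loop gemm above (column-major
+ * arrays; at the call sites in tools.c it replaces dgemm_ with alpha=1, beta=0,
+ * i.e. C is overwritten with A*B, A being m*k, B k*n, C m*n):
+ *   dgemm_nn_3l(m, n, k, A, m, B, k, C, m);
+ */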
diff --git a/include/blasfeo_block_size.h b/include/blasfeo_block_size.h
new file mode 100644
index 0000000..9b74139
--- /dev/null
+++ b/include/blasfeo_block_size.h
@@ -0,0 +1,88 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifndef BLASFEO_BLOCK_SIZE
+#define BLASFEO_BLOCK_SIZE
+
+
+
+#if defined( TARGET_X64_INTEL_HASWELL )
+
+#define D_PS 4
+#define S_PS 8
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_INTEL_SANDY_BRIDGE )
+
+#define D_PS 4
+#define S_PS 8
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_INTEL_CORE )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_AMD_BULLDOZER )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_ARMV8A_ARM_CORTEX_A57 )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4
+#define S_NC 4
+
+#elif defined( TARGET_ARMV7A_ARM_CORTEX_A15 )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_GENERIC )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#else
+#error "Unknown architecture"
+#endif
+
+
+#endif // BLASFEO_BLOCK_SIZE
diff --git a/include/blasfeo_common.h b/include/blasfeo_common.h
new file mode 100644
index 0000000..3f95c91
--- /dev/null
+++ b/include/blasfeo_common.h
@@ -0,0 +1,146 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+#ifndef BLASFEO_COMMON
+#define BLASFEO_COMMON
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+#include "blasfeo_block_size.h"
+
+// matrix structure
+struct d_strmat
+ {
+ int m; // rows
+ int n; // cols
+	int pm; // packed number of rows
+	int cn; // packed number of cols
+ double *pA; // pointer to a pm*pn array of doubles, the first is aligned to cache line size
+ double *dA; // pointer to a min(m,n) (or max???) array of doubles
+ int use_dA; // flag to tell if dA can be used
+ int memory_size; // size of needed memory
+ };
+
+struct s_strmat
+ {
+ int m; // rows
+ int n; // cols
+	int pm; // packed number of rows
+	int cn; // packed number of cols
+ float *pA; // pointer to a pm*pn array of floats, the first is aligned to cache line size
+ float *dA; // pointer to a min(m,n) (or max???) array of floats
+ int use_dA; // flag to tell if dA can be used
+ int memory_size; // size of needed memory
+ };
+
+// vector structure
+struct d_strvec
+ {
+ int m; // size
+ int pm; // packed size
+ double *pa; // pointer to a pm array of doubles, the first is aligned to cache line size
+ int memory_size; // size of needed memory
+ };
+
+struct s_strvec
+ {
+ int m; // size
+ int pm; // packed size
+ float *pa; // pointer to a pm array of floats, the first is aligned to cache line size
+ int memory_size; // size of needed memory
+ };
+
+#define DMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[((ai)-((ai)&(D_PS-1)))*(sA)->cn+(aj)*D_PS+((ai)&(D_PS-1))])
+#define SMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[((ai)-((ai)&(S_PS-1)))*(sA)->cn+(aj)*S_PS+((ai)&(S_PS-1))])
+#define DVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+#define SVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
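+
+// the element macros above reflect the panel-major layout: the matrix is stored
+// in horizontal panels of D_PS (S_PS) rows, column-major within each panel;
+// e.g. with D_PS=4 and cn=8, element (ai,aj)=(5,2) maps to pA[4*8 + 2*4 + 1] = pA[41]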
+
+#elif defined(LA_BLAS) || defined(LA_REFERENCE)
+
+// matrix structure
+struct d_strmat
+ {
+ int m; // rows
+ int n; // cols
+ double *pA; // pointer to a m*n array of doubles
+ double *dA; // pointer to a min(m,n) (or max???) array of doubles
+ int use_dA; // flag to tell if dA can be used
+ int memory_size; // size of needed memory
+ };
+
+struct s_strmat
+ {
+ int m; // rows
+ int n; // cols
+ float *pA; // pointer to a m*n array of floats
+ float *dA; // pointer to a min(m,n) (or max???) array of floats
+ int use_dA; // flag to tell if dA can be used
+ int memory_size; // size of needed memory
+ };
+
+// vector structure
+struct d_strvec
+ {
+ int m; // size
+ double *pa; // pointer to a m array of doubles, the first is aligned to cache line size
+ int memory_size; // size of needed memory
+ };
+
+struct s_strvec
+ {
+ int m; // size
+ float *pa; // pointer to a m array of floats, the first is aligned to cache line size
+ int memory_size; // size of needed memory
+ };
+
+#define DMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[(ai)+(aj)*(sA)->m])
+#define SMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[(ai)+(aj)*(sA)->m])
+#define DVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+#define SVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+#endif // BLASFEO_COMMON
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_d_aux.h b/include/blasfeo_d_aux.h
new file mode 100644
index 0000000..c4f71ee
--- /dev/null
+++ b/include/blasfeo_d_aux.h
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* d_aux_lib.c
+************************************************/
+
+// returns the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n);
+// returns the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n);
+// returns the memory size (in bytes) needed for a strvec
+int d_size_strvec(int m);
+// create a strmat for a matrix of size m*n by using memory passed by a pointer (pointer is not updated)
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory);
+// create a strvec for a vector of size m by using memory passed by a pointer (pointer is not updated)
+void d_create_strvec(int m, struct d_strvec *sA, void *memory);
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj);
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai);
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj);
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda);
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a);
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda);
+void d_cast_mat2strmat(double *A, struct d_strmat *sA);
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA);
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa);
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj);
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj);
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi);
+double dvecex1_libstr(struct d_strvec *sx, int xi);
+// A <= alpha
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj);
+// a <= alpha
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi);
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj);
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci);
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai);
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci);
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd);
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj);
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd);
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x);
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd);
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd);
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x);
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi);
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd);
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd);
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj);
+void drowin_lib(int kmax, double alpha, double *x, double *pD);
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void drowex_lib(int kmax, double alpha, double *pD, double *x);
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void drowad_lib(int kmax, double alpha, double *x, double *pD);
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD);
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD);
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD);
+void drowsw_lib(int kmax, double *pA, double *pC);
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA);
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd);
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd);
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd);
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd);
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA);
+void dvecin_libsp(int kmax, int *idx, double *x, double *y);
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y);
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi);
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi);
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi);
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi);
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei);
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm);
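+
+// minimal usage sketch (m*n column-major matrix A with lda=m assumed allocated
+// elsewhere; d_zeros_align_bytes is declared in blasfeo_d_aux_ext_dep.h):
+//   struct d_strmat sA;
+//   double *mem;
+//   d_zeros_align_bytes(&mem, d_size_strmat(m, n));
+//   d_create_strmat(m, n, &sA, mem);
+//   d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);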
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_d_aux_ext_dep.h b/include/blasfeo_d_aux_ext_dep.h
new file mode 100644
index 0000000..7b0222b
--- /dev/null
+++ b/include/blasfeo_d_aux_ext_dep.h
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* d_aux_extern_depend_lib.c
+************************************************/
+
+/* column-major matrices */
+
+// dynamically allocate row*col doubles of memory and set accordingly a pointer to double; set allocated memory to zero
+void d_zeros(double **pA, int row, int col);
+// dynamically allocate row*col doubles of memory aligned to 64-byte boundaries and set accordingly a pointer to double; set allocated memory to zero
+void d_zeros_align(double **pA, int row, int col);
+// dynamically allocate size bytes of memory aligned to 64-byte boundaries and set accordingly a pointer to double; set allocated memory to zero
+void d_zeros_align_bytes(double **pA, int size);
+// free the memory allocated by d_zeros
+void d_free(double *pA);
+// free the memory allocated by d_zeros_align or d_zeros_align_bytes
+void d_free_align(double *pA);
+// print a column-major matrix
+void d_print_mat(int m, int n, double *A, int lda);
+// print the transposed of a column-major matrix
+void d_print_tran_mat(int row, int col, double *A, int lda);
+// print to file a column-major matrix
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda);
+// print to file the transposed of a column-major matrix
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda);
+// print in exponential notation a column-major matrix
+void d_print_e_mat(int m, int n, double *A, int lda);
+// print in exponential notation the transposed of a column-major matrix
+void d_print_e_tran_mat(int row, int col, double *A, int lda);
+
+/* strmat and strvec */
+
+#ifdef BLASFEO_COMMON
+// create a strmat for a matrix of size m*n by dynamically allocating memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA);
+// create a strvec for a vector of size m by dynamically allocating memory
+void d_allocate_strvec(int m, struct d_strvec *sa);
+// free the memory allocated by d_allocate_strmat
+void d_free_strmat(struct d_strmat *sA);
+// free the memory allocated by d_allocate_strvec
+void d_free_strvec(struct d_strvec *sa);
+// print a strmat
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+// print in exponential notation a strmat
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+// print to file a strmat
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj);
+// print a strvec
+void d_print_strvec(int m, struct d_strvec *sa, int ai);
+// print in exponential notation a strvec
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai);
+// print to file a strvec
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai);
+// print the transposed of a strvec
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai);
+// print in exponential notation the transposed of a strvec
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai);
+// print to file the transposed of a strvec
+void d_print_tran_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai);
+#endif
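+
+// minimal usage sketch (assumes blasfeo_common.h has been included first, so
+// that the strmat helpers above are visible):
+//   struct d_strmat sA;
+//   d_allocate_strmat(4, 4, &sA);    // contents are left uninitialized
+//   d_print_strmat(4, 4, &sA, 0, 0);
+//   d_free_strmat(&sA);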
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/include/blasfeo_d_blas.h b/include/blasfeo_d_blas.h
new file mode 100644
index 0000000..a473322
--- /dev/null
+++ b/include/blasfeo_d_blas.h
@@ -0,0 +1,159 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+//
+// level 1 BLAS
+//
+
+// y = y + alpha*x
+void daxpy_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// z = x .* y, return sum(z) = x^T * y
+double dvecmuldot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// return x^T * y
+double ddot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi);
+
+
+
+//
+// level 2 BLAS
+//
+
+// dense
+
+// z <= beta * y + alpha * A * x
+void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// z <= beta * y + alpha * A' * x
+void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(n)
+void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(n)
+void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, not_unit
+void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, unit
+void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, not_unit
+void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, unit
+void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) upper, not_transposed, not_unit
+void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
+void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A * x ; A upper triangular
+void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A' * x ; A upper triangular
+void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z_n <= beta_n * y_n + alpha_n * A * x_n
+// z_t <= beta_t * y_t + alpha_t * A' * x_t
+void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t);
+// z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular part of A is accessed
+void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+
+// diagonal
+
+// z <= beta * y + alpha * A * x, A diagonal
+void dgemv_diag_libstr(int m, double alpha, struct d_strvec *sA, int ai, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+
+
+
+//
+// level 3 BLAS
+//
+
+// dense
+
+// D <= beta * C + alpha * A * B^T
+void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B
+void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B^T ; C, D lower triangular
+void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^T ; B upper triangular
+void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A ; A lower triangular
+void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular employing explicit inverse of diagonal
+void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular with unit diagonal
+void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A lower triangular with unit diagonal
+void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+
+// diagonal
+
+// D <= alpha * A * B + beta * C, with A diagonal (stored as strvec)
+void dgemm_diag_left_ib(int m, int n, double alpha, double *dA, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd);
+void dgemm_l_diag_libstr(int m, int n, double alpha, struct d_strvec *sA, int ai, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A * B + beta * C, with B diagonal (stored as strvec)
+void dgemm_r_diag_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sB, int bi, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+
+
+
+//
+// LAPACK
+//
+
+// D <= chol( C ) ; C, D lower triangular
+void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= chol( C + A * B' ) ; C, D lower triangular
+void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= lu( C ) ; no pivoting
+void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= lu( C ) ; pivoting
+void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv);
+// D <= qr( C )
+void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work);
+int dgeqrf_work_size_libstr(int m, int n); // in bytes
+// D <= lq( C )
+void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work);
+int dgelqf_work_size_libstr(int m, int n); // in bytes
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_d_kernel.h b/include/blasfeo_d_kernel.h
new file mode 100644
index 0000000..6f045af
--- /dev/null
+++ b/include/blasfeo_d_kernel.h
@@ -0,0 +1,308 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// level 2 BLAS
+// 12
+void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+// 8
+void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
+// 4
+void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1);
+void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *C, double *D, int km);
+void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *z, int km);
+void kernel_dgemv_nt_6_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *z);
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+
+
+// level 3 BLAS
+// 12x4
+void kernel_dgemm_nt_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dgemm_nn_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nn_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dsyrk_nt_l_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dsyrk_nt_l_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nt_ru_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dtrmm_nt_ru_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nn_rl_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+void kernel_dtrmm_nn_rl_12x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_one_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+void kernel_dtrsm_nt_rl_one_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+void kernel_dtrsm_nn_ll_one_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_12x4_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x12
+void kernel_dgemm_nt_4x12_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x12_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nn_4x12_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dtrsm_nt_rl_inv_4x12_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E, int km, int kn);
+// 8x8
+void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [A00 *; A10 A11]
+void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [L00 *; A10 L11]
+void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [L00 *; A10 L11]
+void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E, int km, int kn);
+// 8x4
+void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+void kernel_dtrmm_nn_rl_8x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_8x4_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x8
+void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dtrsm_nt_rl_inv_4x8_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x4
+void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+// diag
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd);
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D);
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+// low rank update
+void kernel_dger4_sub_12r_lib4(int k, double *A, int sda, double *B, double *C, int sdc);
+void kernel_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km);
+void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc);
+void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km);
+void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C);
+void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km);
+
+
+
+// LAPACK
+// 12x4
+void kernel_dpotrf_nt_l_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_l_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_m_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_r_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_l_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_m_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_r_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 8x8
+void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 8x4
+void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_r_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_r_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 4x4
+void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+#endif
+void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv);
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv);
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD);
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dlarf_4_lib4(int m, int n, double *pV, int sdv, double *tau, double *pC, int sdc); // rank-4 reflector
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW);
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD);
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT);
+void kernel_dgelqf_dlarft12_12_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_12_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_8_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT);
+void kernel_dlarfb12_r_4_lib4(int kmax, double *pV, int sdd, double *pT, double *pD, double *pK, int km);
+void kernel_dlarfb4_r_12_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD);
+
+
+
+// merged routines
+// 12x4
+void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+// 4x12
+void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km_, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 8x8
+void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 8x4
+void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 4x8
+void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km_, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x4
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+
+
+// auxiliary routines
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A);
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A);
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A, int sda, double *C, int sdc);
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_4_0_lib4(int m, double *A, int sda, double *B);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_i_aux_ext_dep.h b/include/blasfeo_i_aux_ext_dep.h
new file mode 100644
index 0000000..5f47088
--- /dev/null
+++ b/include/blasfeo_i_aux_ext_dep.h
@@ -0,0 +1,60 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// i_aux_extern_depend_lib
+void int_zeros(int **pA, int row, int col);
+void int_zeros_align(int **pA, int row, int col);
+void int_free(int *pA);
+void int_free_align(int *pA);
+void int_print_mat(int row, int col, int *A, int lda);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/include/blasfeo_m_aux.h b/include/blasfeo_m_aux.h
new file mode 100644
index 0000000..bbaac28
--- /dev/null
+++ b/include/blasfeo_m_aux.h
@@ -0,0 +1,45 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi);
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi);
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis);
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid);
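+
+// Editorial usage sketch (not upstream documentation): converting between double-
+// and single-precision strmat with the routines above. d_allocate_strmat and
+// s_allocate_strmat are assumed to be the EXT_DEP allocators from the d/s aux
+// headers; any other way of creating the strmat works as well.
+//
+//   struct d_strmat sAd;
+//   struct s_strmat sAs;
+//   d_allocate_strmat(4, 4, &sAd);
+//   s_allocate_strmat(4, 4, &sAs);
+//   // ... fill sAd ...
+//   m_cvt_d2s_strmat(4, 4, &sAd, 0, 0, &sAs, 0, 0);   // sAs <= (float) sAd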
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_s_aux.h b/include/blasfeo_s_aux.h
new file mode 100644
index 0000000..d93509f
--- /dev/null
+++ b/include/blasfeo_s_aux.h
@@ -0,0 +1,137 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* s_aux_lib.c
+************************************************/
+
+// returns the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n);
+// returns the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n);
+// returns the memory size (in bytes) needed for a strvec
+int s_size_strvec(int m);
+// create a strmat for a matrix of size m*n by using memory passed by a pointer (pointer is not updated)
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory);
+// create a strvec for a vector of size m by using memory passed by a pointer (pointer is not updated)
+void s_create_strvec(int m, struct s_strvec *sA, void *memory);
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj);
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai);
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj);
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda);
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a);
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda);
+void s_cast_mat2strmat(float *A, struct s_strmat *sA);
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA);
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa);
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj);
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj);
+void svecin1_libstr(float a, struct s_strvec *sx, int xi);
+float svecex1_libstr(struct s_strvec *sx, int xi);
+// A <= alpha
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj);
+// a <= alpha
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi);
+void sgecp_lib(int m, int n, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj);
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci);
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai);
+void strcp_l_lib(int m, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sgead_lib(int m, int n, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci);
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd);
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi);
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd);
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x);
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd);
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd);
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x);
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi);
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd);
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd);
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj);
+void srowin_lib(int kmax, float alpha, float *x, float *pD);
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void srowex_lib(int kmax, float alpha, float *pD, float *x);
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi);
+void srowad_lib(int kmax, float alpha, float *x, float *pD);
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD);
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD);
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD);
+void srowsw_lib(int kmax, float *pA, float *pC);
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA);
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd);
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd);
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd);
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd);
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA);
+void svecin_libsp(int kmax, int *idx, float *x, float *y);
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y);
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi);
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi);
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi);
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi);
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei);
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/include/blasfeo_s_aux_ext_dep.h b/include/blasfeo_s_aux_ext_dep.h
new file mode 100644
index 0000000..2b9f9d4
--- /dev/null
+++ b/include/blasfeo_s_aux_ext_dep.h
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* s_aux_extern_depend_lib.c
+************************************************/
+
+/* column-major matrices */
+
+// dynamically allocate row*col floats of memory and point *pA to it; the allocated memory is set to zero
+void s_zeros(float **pA, int row, int col);
+// dynamically allocate row*col floats of memory aligned to 64-byte boundaries and point *pA to it; the allocated memory is set to zero
+void s_zeros_align(float **pA, int row, int col);
+// dynamically allocate size bytes of memory aligned to 64-byte boundaries and point *pA to it; the allocated memory is set to zero
+void s_zeros_align_bytes(float **pA, int size);
+// free the memory allocated by s_zeros
+void s_free(float *pA);
+// free the memory allocated by s_zeros_align or s_zeros_align_bytes
+void s_free_align(float *pA);
+// print a column-major matrix
+void s_print_mat(int m, int n, float *A, int lda);
+// print the transpose of a column-major matrix
+void s_print_tran_mat(int row, int col, float *A, int lda);
+// print to file a column-major matrix
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda);
+// print to file the transpose of a column-major matrix
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda);
+// print in exponential notation a column-major matrix
+void s_print_e_mat(int m, int n, float *A, int lda);
+// print in exponential notation the transpose of a column-major matrix
+void s_print_e_tran_mat(int row, int col, float *A, int lda);
+
+/* strmat and strvec */
+
+#ifdef BLASFEO_COMMON
+// create a strmat for a matrix of size m*n by dynamically allocating memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA);
+// create a strvec for a vector of size m by dynamically allocating memory
+void s_allocate_strvec(int m, struct s_strvec *sa);
+// free the memory allocated by s_allocate_strmat
+void s_free_strmat(struct s_strmat *sA);
+// free the memory allocated by s_allocate_strvec
+void s_free_strvec(struct s_strvec *sa);
+// print a strmat
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj);
+// print in exponential notation a strmat
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj);
+// print to file a strmat
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj);
+// print a strvec
+void s_print_strvec(int m, struct s_strvec *sa, int ai);
+// print in exponential notation a strvec
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai);
+// print to file a strvec
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai);
+// print the transpose of a strvec
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai);
+// print in exponential notation the transpose of a strvec
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai);
+// print to file the transpose of a strvec
+void s_print_tran_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai);
+#endif
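+
+// Editorial usage sketch (not upstream documentation): dynamic allocation and
+// printing of a strmat with the helpers above; requires EXT_DEP and the strmat
+// definition (BLASFEO_COMMON). sgese_libstr is declared in blasfeo_s_aux.h.
+//
+//   struct s_strmat sA;
+//   s_allocate_strmat(3, 3, &sA);
+//   sgese_libstr(3, 3, 0.0, &sA, 0, 0);   // A <= 0.0
+//   s_print_strmat(3, 3, &sA, 0, 0);
+//   s_free_strmat(&sA);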
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/include/blasfeo_s_blas.h b/include/blasfeo_s_blas.h
new file mode 100644
index 0000000..a0170a5
--- /dev/null
+++ b/include/blasfeo_s_blas.h
@@ -0,0 +1,160 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+//
+// level 1 BLAS
+//
+
+// z = y + alpha*x
+void saxpy_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// z = x .* y, return sum(z) = x^T * y
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// return x^T * y
+float sdot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi);
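+
+// Editorial usage sketch (not upstream documentation): z = y + alpha*x followed
+// by a dot product, on strvec allocated with s_allocate_strvec
+// (blasfeo_s_aux_ext_dep.h) and initialized with svecse_libstr (blasfeo_s_aux.h).
+//
+//   struct s_strvec sx, sy, sz;
+//   s_allocate_strvec(4, &sx);
+//   s_allocate_strvec(4, &sy);
+//   s_allocate_strvec(4, &sz);
+//   svecse_libstr(4, 1.0, &sx, 0);                   // x <= 1.0
+//   svecse_libstr(4, 2.0, &sy, 0);                   // y <= 2.0
+//   saxpy_libstr(4, 0.5, &sx, 0, &sy, 0, &sz, 0);    // z = y + 0.5*x
+//   float d = sdot_libstr(4, &sx, 0, &sz, 0);        // d = x^T * z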
+
+
+
+//
+// level 2 BLAS
+//
+
+// dense
+
+// z <= beta * y + alpha * A * x
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// z <= beta * y + alpha * A' * x
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(n)
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(n)
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, not_unit
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, unit
+void strsv_lnu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, not_unit
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, unit
+void strsv_ltu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) upper, not_transposed, not_unit
+void strsv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
+void strsv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A * x ; A upper triangular
+void strmv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A' * x ; A upper triangular
+void strmv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z_n <= beta_n * y_n + alpha_n * A * x_n
+// z_t <= beta_t * y_t + alpha_t * A' * x_t
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t);
+// z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular part of A is accessed
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+
+// diagonal
+
+// z <= beta * y + alpha * A * x, A diagonal
+void sgemv_diag_libstr(int m, float alpha, struct s_strvec *sA, int ai, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
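+
+// Editorial usage sketch (not upstream documentation): z <= beta*y + alpha*A*x
+// with the dense matrix-vector product above. sA (m x n), sx (n), sy and sz (m)
+// are assumed to be already created strmat/strvec.
+//
+//   sgemv_n_libstr(m, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);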
+
+
+
+//
+// level 3 BLAS
+//
+
+// dense
+
+// D <= beta * C + alpha * A * B^T
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B^T ; C, D lower triangular
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^T ; B upper triangular
+void strmm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A ; A lower triangular
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular employing explicit inverse of diagonal
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular with unit diagonal
+void strsm_rltu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void strsm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A lower triangular with unit diagonal
+void strsm_llnu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void strsm_lunn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+
+// diagonal
+
+// D <= alpha * A * B + beta * C, with A diagonal (stored as strvec)
+void sgemm_diag_left_ib(int m, int n, float alpha, float *dA, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd);
+void sgemm_l_diag_libstr(int m, int n, float alpha, struct s_strvec *sA, int ai, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A * B + beta * C, with B diagonal (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
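+
+// Editorial usage sketch (not upstream documentation): D <= beta*C + alpha*A*B^T
+// with the dense gemm above. sA (m x k), sB (n x k), sC and sD (m x n) are
+// assumed to be already created strmat.
+//
+//   sgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);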
+
+
+
+//
+// LAPACK
+//
+
+// D <= chol( C ) ; C, D lower triangular
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= chol( C + A * B' ) ; C, D lower triangular
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= lu( C ) ; no pivoting
+void sgetrf_nopivot_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= lu( C ) ; pivoting
+void sgetrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, int *ipiv);
+// D <= qr( C )
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work);
+int sgeqrf_work_size_libstr(int m, int n); // in bytes
+// D <= lq( C )
+void sgelqf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work);
+int sgelqf_work_size_libstr(int m, int n); // in bytes
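+
+// Editorial usage sketch (not upstream documentation): lower Cholesky
+// factorization D <= chol( C ), assuming sC holds a symmetric positive-definite
+// m x m matrix in its lower triangle and sD is an already created m x m strmat.
+//
+//   spotrf_l_libstr(m, &sC, 0, 0, &sD, 0, 0);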
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_s_kernel.h b/include/blasfeo_s_kernel.h
new file mode 100644
index 0000000..c0dc2b0
--- /dev/null
+++ b/include/blasfeo_s_kernel.h
@@ -0,0 +1,355 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// lib8
+//
+
+// 24x4
+void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+// 16x4
+void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+// 8x8
+void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+// 8x4
+void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+//void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
+void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+// 4x8
+void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+// 8
+void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_n_8_gen_lib8(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_sgemv_nt_4_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_vs_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+void kernel_ssymv_l_4l_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z);
+void kernel_ssymv_l_4r_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z);
+void kernel_ssymv_l_4l_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+void kernel_ssymv_l_4r_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+// aux
+void kernel_sgecp_8_0_lib8(int m, float *A, float *B);
+void kernel_sgecp_8_0_gen_lib8(int m, float *A, float *B, int m1);
+void kernel_sgecp_8_1_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_2_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_3_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_4_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_5_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_6_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_7_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgesc_8_lib8(int m, float *alpha, float *A);
+void kernel_sgesc_8_gen_lib8(int m, float *alpha, float *A, int m1);
+void kernel_sgetr_8_0_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_0_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_1_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_2_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_3_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_4_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_5_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_6_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_7_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_0_lib8(int m, float *alpha, float *A, float *B);
+void kernel_sgead_8_0_gen_lib8(int m, float *alpha, float *A, float *B, int m1);
+void kernel_sgead_8_1_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_1_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_2_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_2_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_3_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_3_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_4_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_4_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_5_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_5_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_6_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_6_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_7_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_7_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+
+
+//
+// lib4
+//
+
+
+
+// level 2 BLAS
+// 4
+void kernel_sgemv_n_4_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_n_4_vs_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+void kernel_sgemv_t_4_lib4(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_4_vs_lib4(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_4_gen_lib4(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_strsv_ln_inv_4_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_ln_inv_4_vs_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_strsv_lt_inv_4_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_3_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_2_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_1_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strmv_un_4_lib4(int k, float *A, float *x, float *z);
+void kernel_strmv_ut_4_lib4(int k, float *A, int sda, float *x, float *z);
+void kernel_strmv_ut_4_vs_lib4(int k, float *A, int sda, float *x, float *z, int km);
+void kernel_sgemv_nt_6_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n);
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km);
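+// Illustrative sketch (not part of the API): the 'n' gemv kernels above are assumed to
+// compute z[0:4] = beta*y[0:4] + alpha*A*x, with A a 4-row panel of k packed columns, e.g.
+//
+//   float alpha = 1.0f, beta = 1.0f;
+//   kernel_sgemv_n_4_lib4(k, &alpha, pA, px, &beta, py, pz);   // pA, px, py, pz: hypothetical packed buffers
+//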
+
+
+
+// level 3 BLAS
+// 12x4
+void kernel_sgemm_nt_12x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); //
+// 8x8
+void kernel_sgemm_nt_8x8_lib4(int k, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); //
+// 8x4
+void kernel_sgemm_nt_8x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); //
+// 4x4
+void kernel_sgemm_nt_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_sgemm_nt_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_sgemm_nt_4x4_gen_lib4(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_sgemm_nn_4x4_lib4(int k, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D); //
+void kernel_sgemm_nn_4x4_vs_lib4(int k, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); //
+void kernel_ssyrk_nt_l_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_strmm_nt_ru_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_strmm_nn_rl_4x4_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
+void kernel_strmm_nn_rl_4x4_gen_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E);
+void kernel_strsm_nt_rl_one_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, int km, int kn);
+void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nn_ru_inv_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nn_ll_one_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E);
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn);
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
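+// Illustrative sketch (not part of the API): the 4x4 'nt' gemm kernel above is assumed to
+// compute D[0:4,0:4] = alpha * A * B^T + beta * C on panel-major (lib4) data, with A and B
+// each pointing to a 4-row panel of k packed columns. A hypothetical call on packed
+// buffers pA, pB, pC, pD could look like:
+//
+//   float alpha = 1.0f, beta = 0.0f;
+//   kernel_sgemm_nt_4x4_lib4(k, &alpha, pA, pB, &beta, pC, pD);
+//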
+// diag
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd);
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D);
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+
+
+// LAPACK
+// 4x4
+void kernel_spotrf_nt_l_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_sgetrf_nn_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D);
+void kernel_sgetrf_nn_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv);
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv);
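+// Note (assumption, based on how these factorization kernels are typically used): the
+// inv_diag_D / inv_diag_A outputs are expected to hold the reciprocals of the factor's
+// diagonal entries, so that later triangular solves can multiply instead of divide.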
+
+
+
+// merged routines
+// 4x4
+void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+
+
+// auxiliary routines
+void kernel_sgesc_4_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_3_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_2_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_1_lib4(int kmax, float *alpha, float *A);
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B);
+void kernel_sgead_4_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_4_1_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_4_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_4_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_3_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_3_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_3_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_2_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_2_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_1_0_lib4(int kmax, float *alpha, float *A, float *B);
+// TODO
+void kernel_sgeset_4_lib4(int kmax, float alpha, float *A);
+void kernel_strset_4_lib4(int kmax, float alpha, float *A);
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_v_aux_ext_dep.h b/include/blasfeo_v_aux_ext_dep.h
new file mode 100644
index 0000000..2555fab
--- /dev/null
+++ b/include/blasfeo_v_aux_ext_dep.h
@@ -0,0 +1,71 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* external-dependency auxiliary functions
+* (memory allocation and freeing)
+************************************************/
+
+// dynamically allocate size bytes of memory and set the void pointer to it; the allocated memory is set to zero
+void v_zeros(void **ptrA, int size);
+// dynamically allocate size bytes of memory aligned to a 64-byte boundary and set the void pointer to it; the allocated memory is set to zero
+void v_zeros_align(void **ptrA, int size);
+// free the memory allocated by v_zeros
+void v_free(void *ptrA);
+// free the memory allocated by v_zeros_align
+void v_free_align(void *ptrA);
+// dynamically allocate size bytes of memory and set the char pointer to it; the allocated memory is set to zero
+void c_zeros(char **ptrA, int size);
+// dynamically allocate size bytes of memory aligned to a 64-byte boundary and set the char pointer to it; the allocated memory is set to zero
+void c_zeros_align(char **ptrA, int size);
+// free the memory allocated by c_zeros
+void c_free(char *ptrA);
+// free the memory allocated by c_zeros_align
+void c_free_align(char *ptrA);
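+// Illustrative usage sketch (assumption: memory obtained from the *_align variants must be
+// released with the matching *_free_align call):
+//
+//   void *mem;
+//   v_zeros_align(&mem, 1024);   // 1024 zeroed bytes, aligned to a 64-byte boundary
+//   /* ... use mem ... */
+//   v_free_align(mem);
+//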
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..60e1f31
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,75 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+obj:
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+ ( cd avx2; $(MAKE) obj)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+ ( cd sse3; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+ ( cd fma; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+ ( cd armv8a; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+ ( cd armv7a; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), GENERIC)
+ ( cd c99; $(MAKE) obj)
+endif
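+
+# Example (TARGET is assumed to be set in ../Makefile.rule): with
+# TARGET=ARMV7A_ARM_CORTEX_A15, running 'make obj' here descends into armv7a and c99 only.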
+
+clean:
+	$(MAKE) -C avx2 clean
+	$(MAKE) -C avx clean
+	$(MAKE) -C sse3 clean
+	$(MAKE) -C fma clean
+	$(MAKE) -C armv8a clean
+	$(MAKE) -C armv7a clean
+	$(MAKE) -C c99 clean
+
diff --git a/kernel/armv7a/Makefile b/kernel/armv7a/Makefile
new file mode 100644
index 0000000..4cb59a7
--- /dev/null
+++ b/kernel/armv7a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_12x4_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/armv7a/kernel_dgemm_4x4_lib4.S b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..86aee4f
--- /dev/null
+++ b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3223 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
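+// (added description, inferred from the code below)
+// d0-d15            <- 4x4 accumulator, updated in place with += A*B^T contributions;
+//                      column j of the block lives in d(4j)..d(4j+3); initialized by the caller
+// d16-d19 / d24-d27 <- even / odd k-step 4x1 slices of A (clobbered)
+// d20-d23 / d28-d31 <- even / odd k-step 4x1 slices of B (clobbered)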
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ fmacd d0, d16, d20
+ pld [r5, #128] // prefetch
+ fmacd d1, d17, d20
+ pld [r6, #128] // prefetch
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+
+
+ // unroll 2
+ fmacd d0, d16, d20
+ pld [r6, #192] // prefetch
+ fmacd d1, d17, d20
+ add r6, r6, #128
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d4, d16, d21
+ pld [r5, #192] // prefetch
+ fmacd d5, d17, d21
+ add r5, r5, #128
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d8, d24, d30
+ sub r4, r4, #4
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d12, d24, d31
+ fmacd d13, d25, d31
+ fmacd d14, d26, d31
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(double)
+//
+// output arguments:
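+// (added description, inferred from the code below)
+// d0-d15 <- 4x4 accumulator (column j in d(4j)..d(4j+3)), updated in place; initialized by the caller
+// B is traversed row-wise inside its 4-wide panel: the four B values used at each k-step
+// are 32 bytes (one packed 4-double column) apart, consecutive k-steps advance by 8 bytes,
+// and r9 = r6 + r7 pre-addresses the panel of B reached after the next 4 k-steps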
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, %function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+ pld [r6, #64]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #32]
+ fldd d22, [r6, #64]
+ fldd d23, [r6, #96]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #8]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #72]
+ fldd d31, [r6, #104]
+
+ // prefetch
+ pld [r5, #64]
+
+ // B next
+ add r9, r7, r6
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ fmacd d0, d16, d20
+ pld [r5, #128] // prefetch
+ fmacd d1, d17, d20
+ pld [r9, #0]
+ fmacd d2, d18, d20
+ pld [r9, #64]
+ fmacd d3, d19, d20
+ fldd d20, [r6, #16] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #48] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #112] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #24] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #56] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #88] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // unroll 2
+ fmacd d0, d16, d20
+ pld [r5, #192] // prefetch
+ fmacd d1, d17, d20
+ mov r6, r9
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d4, d16, d21
+ add r5, r5, #128
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #32] // B
+
+ fmacd d8, d16, d22
+ add r9, r9, r7
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #64] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #96] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #8] // B
+
+ fmacd d4, d24, d29
+ sub r4, r4, #4
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #72] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #104] // B
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #16] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #48] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #112] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #24] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #56] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #88] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ mov r6, r9
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d12, d24, d31
+ fmacd d13, d25, d31
+ fmacd d14, d26, d31
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #32] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #64] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #96] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #8
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
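+// (added description, inferred from the code below)
+// only the lower-triangular accumulators of the 4x4 block are updated in place:
+// column 0 -> d0-d3, column 1 -> d5-d7, column 2 -> d10-d11, column 3 -> d15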
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dsyrk_l_add_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fldd d16, [r5, #0] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #8] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #16] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fldd d24, [r5, #32] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #40] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #48] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dsyrk_l_add_nt_4x4_lib4, .-inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
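+// (added description, inferred from the code below)
+// same register layout as the add_nt variant, but fnmacd is used, so the 4x4
+// accumulator d0-d15 is updated in place with -= A*B^T contributions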
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, %function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fnmacd d12, d16, d23
+ fnmacd d13, d17, d23
+ fnmacd d14, d18, d23
+ fnmacd d15, d19, d23
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+
+ fnmacd d12, d24, d31
+ fnmacd d13, d25, d31
+ fnmacd d14, d26, d31
+ fnmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fnmacd d12, d16, d23
+ fnmacd d13, d17, d23
+ fnmacd d14, d18, d23
+ fnmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
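+// (added description, inferred from the code below)
+// lower-triangle-only variant of the sub_nt kernel: accumulators d0-d3, d5-d7,
+// d10-d11 and d15 are updated in place with -= A*B^T contributions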
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dsyrk_l_sub_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #0] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #8] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #16] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #32] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #40] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #48] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fnmacd d15, d19, d23
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+
+ fnmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fnmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dsyrk_l_sub_nt_4x4_lib4, .-inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- bs*sdb*sizeof(double)
+// r8 <- offsetB
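+//
+// note (editorial sketch, not in the original source): this edge routine is assumed to
+// consume the first kend = min(k, 4-offsetB) iterations of the product, reading one row
+// of B per iteration starting at row offsetB of the current 4-row panel (its four
+// elements are 32 bytes apart), and then to advance B to the start of the next panel so
+// the main NN kernel can continue from there.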
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, %function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmp r8, #0
+ ble 2f // return
+
+ cmp r4, #0
+ ble 2f // return
+
+ rsb r9, r8, #4 // 4-offsetB
+ cmp r9, r4
+// ble 0f
+// mov r9, r4 // kend=min(k,4-offsetB)
+//0:
+ movgt r9, r4 // kend=min(k,4-offsetB)
+
+// lsl r10, r8, #3 // offsetB*sizeof(double)
+ add r6, r6, r8, LSL #3 // B + offsetB*sizeof(double)
+
+1:
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #32] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #64] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #96] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ sub r4, r4, #1
+ sub r9, r9, #1
+ add r5, r5, #32
+ add r6, r6, #8
+
+ cmp r9, #0
+ bgt 1b
+
+ cmp r4, #0
+ ble 2f // return
+
+ add r6, r6, r7
+ sub r6, r6, #32
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r4 <- E
+// r5 <- inv_diag_E
+//
+// output arguments:
+// r4 <- E
+// r5 <- inv_diag_E
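+//
+// reference sketch (editorial, assuming the 4x4 lib4 panel layout): for each column
+// j = 0..3 the code below computes
+//   acc[:,j] = ( acc[:,j] - sum_{k<j} acc[:,k] * E[j+4*k] ) * inv_diag_E[j]
+// i.e. it solves acc * E^T = rhs for a lower-triangular E, using the precomputed
+// reciprocals of the diagonal instead of divisions.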
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, %function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ fldd d16, [r5, #0] // E_inv[0]
+ fmuld d0, d0, d16
+ fmuld d1, d1, d16
+ fmuld d2, d2, d16
+ fmuld d3, d3, d16
+
+ // second column
+ fldd d16, [r4, #8] // E[1+4*0]
+ fnmacd d4, d0, d16
+ fnmacd d5, d1, d16
+ fnmacd d6, d2, d16
+ fnmacd d7, d3, d16
+ fldd d16, [r5, #8] // E_inv[1]
+ fmuld d4, d4, d16
+ fmuld d5, d5, d16
+ fmuld d6, d6, d16
+ fmuld d7, d7, d16
+
+ // third column
+ fldd d16, [r4, #16] // E[2+4*0]
+ fnmacd d8, d0, d16
+ fnmacd d9, d1, d16
+ fnmacd d10, d2, d16
+ fnmacd d11, d3, d16
+ fldd d16, [r4, #48] // E[2+4*1]
+ fnmacd d8, d4, d16
+ fnmacd d9, d5, d16
+ fnmacd d10, d6, d16
+ fnmacd d11, d7, d16
+ fldd d16, [r5, #16] // E_inv[2]
+ fmuld d8, d8, d16
+ fmuld d9, d9, d16
+ fmuld d10, d10, d16
+ fmuld d11, d11, d16
+
+ // fourth column
+ fldd d16, [r4, #24] // E[3+4*0]
+ fnmacd d12, d0, d16
+ fnmacd d13, d1, d16
+ fnmacd d14, d2, d16
+ fnmacd d15, d3, d16
+ fldd d16, [r4, #56] // E[3+4*1]
+ fnmacd d12, d4, d16
+ fnmacd d13, d5, d16
+ fnmacd d14, d6, d16
+ fnmacd d15, d7, d16
+ fldd d16, [r4, #88] // E[3+4*2]
+ fnmacd d12, d8, d16
+ fnmacd d13, d9, d16
+ fnmacd d14, d10, d16
+ fnmacd d15, d11, d16
+ fldd d16, [r5, #24] // E_inv[3]
+ fmuld d12, d12, d16
+ fmuld d13, d13, d16
+ fmuld d14, d14, d16
+ fmuld d15, d15, d16
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// cholesky factorization
+//
+// input arguments:
+// r4 <- inv_diag_D
+//
+// output arguments:
+// r4 <- inv_diag_D
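+//
+// reference sketch (editorial, not in the original source): for each column j = 0..3
+// the code below subtracts the contributions of the previously factorized columns,
+// takes the square root of the diagonal entry, stores its reciprocal in inv_diag_D[j]
+// and scales the sub-diagonal entries by it; if the pivot is not strictly positive,
+// the column is zeroed via the .LC01 constant instead.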
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_lib4, %function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+
+ fconstd d16, #112 // 1.0
+ fldd d17, .LC01 // 0.0
+
+ // first column
+ fcmped d0, d17
+ fmstat
+ ble 1f
+ fsqrtd d0, d0
+ fdivd d18, d16, d0
+ fstd d18, [r4, #0]
+2:
+ fmuld d1, d1, d18
+ fmuld d2, d2, d18
+ fmuld d3, d3, d18
+
+ // second column
+ fnmacd d5, d1, d1
+ fnmacd d6, d1, d2
+ fnmacd d7, d1, d3
+ fcmped d5, d17
+ fmstat
+ ble 3f
+ fsqrtd d5, d5
+ fdivd d18, d16, d5
+ fstd d18, [r4, #8]
+4:
+ fmuld d6, d6, d18
+ fmuld d7, d7, d18
+
+ // third column
+ fnmacd d10, d2, d2
+ fnmacd d11, d2, d3
+ fnmacd d10, d6, d6
+ fnmacd d11, d6, d7
+ fcmped d10, d17
+ fmstat
+ ble 5f
+ fsqrtd d10, d10
+ fdivd d18, d16, d10
+ fstd d18, [r4, #16]
+6:
+ fmuld d11, d11, d18
+
+ // fourth column
+ fnmacd d15, d3, d3
+ fnmacd d15, d7, d7
+ fnmacd d15, d11, d11
+ fcmped d15, d17
+ fmstat
+ ble 7f
+ fsqrtd d15, d15
+ fdivd d18, d16, d15
+ fstd d18, [r4, #24]
+
+ b 0f
+
+1:
+ fldd d0, .LC01
+ b 2b
+
+3:
+ fldd d5, .LC01
+ b 4b
+
+5:
+ fldd d10, .LC01
+ b 6b
+
+7:
+ fldd d15, .LC01
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+ .align 3
+.LC01: // { 0 }
+ .word 0
+ .word 0
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+//
+// output arguments:
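+//
+// note (editorial): the accumulator is scaled by alpha and, unless beta compares equal
+// to 0.0 (the fcmped against .LC01 below), beta*C is added on top, so the common
+// beta==0 case skips all loads from C.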
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ fldd d16, [r4, #0] // alpha
+
+ fmuld d0, d0, d16
+ fldd d18, [r5, #0] // beta
+ fmuld d1, d1, d16
+ fldd d17, .LC01 // 0.0
+ fmuld d2, d2, d16
+ fmuld d3, d3, d16
+
+ fmuld d4, d4, d16
+ fmuld d5, d5, d16
+ fmuld d6, d6, d16
+ fmuld d7, d7, d16
+
+ fmuld d8, d8, d16
+ fcmped d18, d17
+ fmuld d9, d9, d16
+ fmuld d10, d10, d16
+ fmuld d11, d11, d16
+
+ fmuld d12, d12, d16
+ fmstat
+ fmuld d13, d13, d16
+ fmuld d14, d14, d16
+ fmuld d15, d15, d16
+
+ beq 0f // end
+
+ fldd d17, [r6, #0] // C
+ fmacd d0, d18, d17
+ fldd d17, [r6, #8] // C
+ fmacd d1, d18, d17
+ fldd d17, [r6, #16] // C
+ fmacd d2, d18, d17
+ fldd d17, [r6, #24] // C
+ fmacd d3, d18, d17
+
+ fldd d17, [r6, #32] // C
+ fmacd d4, d18, d17
+ fldd d17, [r6, #40] // C
+ fmacd d5, d18, d17
+ fldd d17, [r6, #48] // C
+ fmacd d6, d18, d17
+ fldd d17, [r6, #56] // C
+ fmacd d7, d18, d17
+
+ fldd d17, [r6, #64] // C
+ fmacd d8, d18, d17
+ fldd d17, [r6, #72] // C
+ fmacd d9, d18, d17
+ fldd d17, [r6, #80] // C
+ fmacd d10, d18, d17
+ fldd d17, [r6, #88] // C
+ fmacd d11, d18, d17
+
+ fldd d17, [r6, #96] // C
+ fmacd d12, d18, d17
+ fldd d17, [r6, #104] // C
+ fmacd d13, d18, d17
+ fldd d17, [r6, #112] // C
+ fmacd d14, d18, d17
+ fldd d17, [r6, #120] // C
+ fmacd d15, d18, d17
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, %function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ fldd d17, [r4, #0] // C
+ faddd d0, d0, d17
+ fldd d17, [r4, #8] // C
+ faddd d1, d1, d17
+ fldd d17, [r4, #16] // C
+ faddd d2, d2, d17
+ fldd d17, [r4, #24] // C
+ faddd d3, d3, d17
+
+ fldd d17, [r4, #32] // C
+ faddd d4, d4, d17
+ fldd d17, [r4, #40] // C
+ faddd d5, d5, d17
+ fldd d17, [r4, #48] // C
+ faddd d6, d6, d17
+ fldd d17, [r4, #56] // C
+ faddd d7, d7, d17
+
+ fldd d17, [r4, #64] // C
+ faddd d8, d8, d17
+ fldd d17, [r4, #72] // C
+ faddd d9, d9, d17
+ fldd d17, [r4, #80] // C
+ faddd d10, d10, d17
+ fldd d17, [r4, #88] // C
+ faddd d11, d11, d17
+
+ fldd d17, [r4, #96] // C
+ faddd d12, d12, d17
+ fldd d17, [r4, #104] // C
+ faddd d13, d13, d17
+ fldd d17, [r4, #112] // C
+ faddd d14, d14, d17
+ fldd d17, [r4, #120] // C
+ faddd d15, d15, d17
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+ fstd d0, [r4, #0]
+ fstd d1, [r4, #8]
+ fstd d2, [r4, #16]
+ fstd d3, [r4, #24]
+
+ fstd d4, [r4, #32]
+ fstd d5, [r4, #40]
+ fstd d6, [r4, #48]
+ fstd d7, [r4, #56]
+
+ fstd d8, [r4, #64]
+ fstd d9, [r4, #72]
+ fstd d10, [r4, #80]
+ fstd d11, [r4, #88]
+
+ fstd d12, [r4, #96]
+ fstd d13, [r4, #104]
+ fstd d14, [r4, #112]
+ fstd d15, [r4, #120]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, %function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ fstd d0, [r4, #0]
+ fstd d1, [r4, #8]
+ fstd d2, [r4, #16]
+ fstd d3, [r4, #24]
+
+// fstd d4, [r4, #32]
+ fstd d5, [r4, #40]
+ fstd d6, [r4, #48]
+ fstd d7, [r4, #56]
+
+// fstd d8, [r4, #64]
+// fstd d9, [r4, #72]
+ fstd d10, [r4, #80]
+ fstd d11, [r4, #88]
+
+// fstd d12, [r4, #96]
+// fstd d13, [r4, #104]
+// fstd d14, [r4, #112]
+ fstd d15, [r4, #120]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
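+//
+// reference operation (editorial sketch, assuming the 4x4 panel-major lib4 layout):
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       {
+//       tmp = 0.0;
+//       for(k=0; k<kmax; k++)
+//         tmp += A[i+4*k] * B[j+4*k];
+//       D[i+4*j] = alpha[0]*tmp + beta[0]*C[i+4*j];
+//       }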
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .global _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dgemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16
+// void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
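+//
+// note (editorial sketch): same as the NT kernel above but with B not transposed,
+// i.e. D = alpha*A*B + beta*C, where B is stored in 4-row panels with panel stride
+// sdb and is entered at row offsetB of its first panel.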
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, %function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .global _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dgemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ ldr r6, [fp, #0] // B
+ ldr r7, [fp, #4] // sdb
+ lsl r7, r7, #5 // 4*sizeof(double)*sdb
+ mov r8, r3 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #8] // beta
+ ldr r6, [fp, #12] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
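+//
+// note (editorial sketch): computes the lower triangle of D = alpha*A*B^T + beta*C;
+// only the lower triangle of the 4x4 result is stored (inner_store_l_4x4_lib4).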
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, %function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dsyrk l nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
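+//
+// note (editorial sketch): assumed to compute D = (C - A*B^T) * E^{-T}, with E a 4x4
+// lower-triangular factor and inv_diag_E holding the reciprocals of its diagonal
+// entries, so the substitution uses multiplications only.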
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dgemm nt sub
+ mov r4, r0 // kmax
+ mov r5, r1 // A
+ mov r6, r2 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ mov r4, r3 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #4] // E
+ ldr r5, [fp, #8] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4
+// void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D);
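+//
+// note (editorial sketch): assumed to compute the lower Cholesky factor D of the 4x4
+// matrix C - A*B^T, i.e. D*D^T = C - A*B^T, storing only the lower triangle of D and
+// writing the reciprocals of its diagonal to inv_diag_D.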
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, %function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dsyrk l nt sub
+ mov r4, r0 // kmax
+ mov r5, r1 // A
+ mov r6, r2 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ mov r4, r3 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #4] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
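+//
+// note (editorial sketch): fused variant of the gemm and trsm kernels above, assumed
+// to form acc = Ap*Bp^T (kp terms) - Am*Bm^T (km terms) + C and then apply the same
+// right-lower-transposed substitution with E and inv_diag_E before storing D.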
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dgemm nt add
+ mov r4, r0 // kp
+ mov r5, r1 // Ap
+ mov r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner kernel dgemm nt sub
+ mov r4, r3 // km
+ ldr r5, [fp, #0] // Am
+ ldr r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ ldr r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #16] // E
+ ldr r5, [fp, #20] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
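+//
+// note (editorial sketch): fused variant, assumed to form the lower triangle of
+// acc = Ap*Bp^T (kp terms) - Am*Bm^T (km terms) + C and then run the 4x4 Cholesky
+// edge on it, storing the lower-triangular result and the reciprocals of its
+// diagonal in inv_diag_D.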
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, %function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dsyrk l nt
+ mov r4, r0 // kp
+ mov r5, r1 // Ap
+ mov r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner kernel dsyrk l nt sub
+ mov r4, r3 // km
+ ldr r5, [fp, #0] // Am
+ ldr r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ ldr r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #16] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_12x4_lib4.S b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..96ff7a4
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,589 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- 4*sda*sizeof(float)
+// r7 <- B
+//
+// output arguments:
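+//
+// note (editorial): the 12x4 accumulator is assumed to be kept in q4-q15, one
+// q register per column of a 4-row slice: q4-q7 for rows 0-3 (panel A0), q8-q11 for
+// rows 4-7 (panel A1 = A0 + 4*sda*sizeof(float)) and q12-q15 for rows 8-11 (panel A2).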
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ add r8, r5, r6 // A1
+ add r9, r8, r6 // A2
+
+ // prefetch
+ pld [r5, #0] // A0
+ pld [r7, #0] // B
+ pld [r8, #0] // A1
+ pld [r9, #0] // A2
+
+ // preload
+ vld1.64 {d0, d1}, [r7:128] // B
+ vld1.64 {d2, d3}, [r5:128] // A0
+ vld1.64 {d4, d5}, [r8:128] // A1
+// vld1.64 {d6, d7}, [r9:128] // A2
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ pld [r5, #64] // A0
+ pld [r7, #64] // B
+ pld [r8, #64] // A1
+ pld [r9, #64] // A2
+
+ // main loop
+1:
+
+ // unroll 0
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #0] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #8] // A2
+ vmla.f32 q6, q1, d1[0]
+ pld [r7, #128]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #16] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #24] // A0
+ vmla.f32 q9, q2, d0[1]
+ pld [r5, #128]
+ vmla.f32 q10, q2, d1[0]
+ pld [r8, #128]
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #16] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #24] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #16] // A1
+ vmla.f32 q14, q3, d1[0]
+ pld [r9, #128]
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #24] // A1
+
+ // unroll 1
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #16] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #24] // A2
+ vmla.f32 q6, q1, d5[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #32] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #40] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #32] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #40] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #32] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #40] // A1
+
+ // unroll 2
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #32] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #40] // A2
+ vmla.f32 q6, q1, d1[0]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #48] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #56] // A0
+ vmla.f32 q9, q2, d0[1]
+ vmla.f32 q10, q2, d1[0]
+ add r5, r5, #64
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #48] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #56] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #48] // A1
+ vmla.f32 q14, q3, d1[0]
+ add r7, r7, #64
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #56] // A1
+
+ // unroll 3
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #48] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #56] // A2
+ vmla.f32 q6, q1, d5[0]
+ add r8, r8, #64
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #0] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #8] // A0
+ vmla.f32 q9, q0, d4[1]
+ add r9, r9, #64
+ vmla.f32 q10, q0, d5[0]
+ cmp r4, #4
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #0] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #8] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #0] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #8] // A1
+
+
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+
+ // unroll 0
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #0] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #8] // A2
+ vmla.f32 q6, q1, d1[0]
+ pld [r7, #64]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #16] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #24] // A0
+ vmla.f32 q9, q2, d0[1]
+ pld [r5, #64]
+ vmla.f32 q10, q2, d1[0]
+ pld [r8, #64]
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #16] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #24] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #16] // A1
+ vmla.f32 q14, q3, d1[0]
+ pld [r9, #64]
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #24] // A1
+
+ // unroll 1
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #16] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #24] // A2
+ vmla.f32 q6, q1, d5[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #32] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #40] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #32] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #40] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #32] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #40] // A1
+
+ // unroll 2
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #32] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #40] // A2
+ vmla.f32 q6, q1, d1[0]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #48] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #56] // A0
+ vmla.f32 q9, q2, d0[1]
+ vmla.f32 q10, q2, d1[0]
+ add r5, r5, #64
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #48] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #56] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #48] // A1
+ vmla.f32 q14, q3, d1[0]
+ add r7, r7, #64
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #56] // A1
+
+ // unroll 3
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #48] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #56] // A2
+ vmla.f32 q6, q1, d5[0]
+ add r9, r9, #64
+ vmla.f32 q7, q1, d5[1]
+// vldr d2, [r5, #0] // A0
+ vmla.f32 q8, q0, d4[0]
+// vldr d3, [r5, #8] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ add r8, r8, #64
+ vmla.f32 q11, q0, d5[1]
+// vldr d0, [r7, #0] // B
+ vmla.f32 q12, q3, d4[0]
+// vldr d1, [r7, #8] // B
+ vmla.f32 q13, q3, d4[1]
+// vldr d4, [r8, #0] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+// vldr d5, [r8, #8] // A1
+
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d4, d5}, [r7:128]! // B
+ vld1.64 {d0, d1}, [r5:128]! // A0
+ vmla.f32 q4, q0, d4[0]
+ sub r4, r4, #1
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+ vld1.64 {d0, d1}, [r8:128]! // A1
+ vmla.f32 q8, q0, d4[0]
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vld1.64 {d0, d1}, [r9:128]! // A2
+ vmla.f32 q12, q0, d4[0]
+ vmla.f32 q13, q0, d4[1]
+ vmla.f32 q14, q0, d5[0]
+ vmla.f32 q15, q0, d5[1]
+
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+// r7 <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ vmul.f32 q7, q7, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q8, q8, d4[0]
+ vmul.f32 q9, q9, d4[0]
+ vmul.f32 q10, q10, d4[0]
+ vmul.f32 q11, q11, d4[0]
+ vmul.f32 q12, q12, d4[0]
+ vmul.f32 q13, q13, d4[0]
+ vmul.f32 q14, q14, d4[0]
+ vmul.f32 q15, q15, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ add r8, r6, r7
+ add r9, r8, r7
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q8, q0, d4[1]
+ vmla.f32 q9, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q10, q0, d4[1]
+ vmla.f32 q11, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r9:128]!
+ vmla.f32 q12, q0, d4[1]
+ vmla.f32 q13, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r9:128]!
+ vmla.f32 q14, q0, d4[1]
+ vmla.f32 q15, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+// r5 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#endif
+#endif
+
+ add r6, r4, r5
+ add r7, r6, r5
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+ vst1.64 {d16, d17, d18, d19}, [r6:128]!
+ vst1.64 {d20, d21, d22, d23}, [r6:128]!
+ vst1.64 {d24, d25, d26, d27}, [r7:128]!
+ vst1.64 {d28, d29, d30, d31}, [r7:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
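+//
+// note (editorial sketch): single-precision 12x4 block, D = alpha*A*B^T + beta*C,
+// where A spans three consecutive 4-row panels separated by sda and C/D span three
+// panels separated by sdc/sdd respectively.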
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_12x4_lib4
+ .type kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+ .global _kernel_sgemm_nt_12x4_lib4
+_kernel_sgemm_nt_12x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+ vmov q8, q4
+ vmov q9, q4
+ vmov q10, q4
+ vmov q11, q4
+ vmov q12, q4
+ vmov q13, q4
+ vmov q14, q4
+ vmov q15, q4
+
+
+
+ // call inner kernel gemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // sda
+ lsl r6, r6, #4 // 4*sizeof(float)*sda
+ ldr r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+ ldr r7, [fp, #12] // sdc
+ lsl r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+ ldr r5, [fp, #20] // sdd
+ lsl r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_12x4_lib4, .-kernel_sgemm_nt_12x4_lib4
+#endif
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_4x4_lib4.S b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..e8a2e71
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,675 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vld1.64 {d4, d5}, [r6:128]! // B
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 1
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 3
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 1
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 3
+ vmla.f32 q4, q1, d6[0]
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+// vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+ sub r6, r6, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q4, q0, d4[0]
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr d4, [r6, #0] // B[0,1]
+ vldr d5, [r6, #16] // B[4,5]
+ vldr d6, [r6, #32] // B[8,9]
+ vldr d7, [r6, #48] // B[12,13]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, r7]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+// vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+// vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+// vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+// vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr s8, [r6, #0] // B[0]
+ vmla.f32 q4, q0, d4[0]
+ vldr s8, [r6, #16] // B[4]
+ vmla.f32 q5, q0, d4[0]
+ vldr s8, [r6, #32] // B[8]
+ vmla.f32 q6, q0, d4[0]
+ vldr s8, [r6, #48] // B[12]
+ vmla.f32 q7, q0, d4[0]
+
+ sub r4, r4, #1
+ add r6, r6, #4
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q7, q7, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
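+//
+// note (editorial sketch): single-precision analogue of kernel_dgemm_nt_4x4_lib4,
+// i.e. D = alpha*A*B^T + beta*C on one 4x4 panel-major block.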
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_4x4_lib4
+ .type kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .global _kernel_sgemm_nt_4x4_lib4
+_kernel_sgemm_nt_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+ // call inner kernel gemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x4_lib4, .-kernel_sgemm_nt_4x4_lib4
+#endif
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
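+//
+// note (editorial sketch): single-precision NN variant, D = alpha*A*B + beta*C, with
+// B stored in 4-row panels of panel stride sdb (no offsetB argument in this kernel).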
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nn_4x4_lib4
+ .type kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .global _kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+ // call inner kernel gemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+ ldr r7, [fp, #0] // sdb
+ lsl r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_8x4_lib4.S b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..f356c9b
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,795 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- 4*sda*sizeof(float)
+// r7 <- B
+//
+// output arguments:
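+//
+// accumulator layout (4 floats per q register):
+// q4  <- D[0:3,0]   q5  <- D[0:3,1]   q6  <- D[0:3,2]   q7  <- D[0:3,3]
+// q8  <- D[4:7,0]   q9  <- D[4:7,1]   q10 <- D[4:7,2]   q11 <- D[4:7,3]
+// the main loop is unrolled by 4 with the loads of the next iteration
+// interleaved; a last unrolled pass without further preload and a 1-column
+// clean-up loop handle the remaining iterations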
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ add r8, r5, r6 // A1
+
+ // prefetch
+ pld [r5, #0]
+ pld [r7, #0]
+ pld [r8, #0]
+ pld [r7, #64]
+
+ // preload
+ vld1.64 {d0, d1}, [r7:128]! // B // TODO preload B in d0-d3 too ?????
+ vld1.64 {d2, d3}, [r7:128]! // B
+ vld1.64 {d4, d5}, [r7:128]! // B // TODO preload B in d0-d3 too ?????
+ vld1.64 {d6, d7}, [r7:128]! // B
+ vld1.64 {d24, d25}, [r5:128]! // A0
+ vld1.64 {d28, d29}, [r5:128]! // A0
+ vld1.64 {d26, d27}, [r8:128] // A1
+
+ sub r7, r7, #64
+ sub r5, r5, #32
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ pld [r5, #64] // A0
+ vmla.f32 q4, q12, d0[0]
+ vldr d30, [r8, #16] // A1
+ vmla.f32 q5, q12, d0[1]
+ vldr d31, [r8, #24] // A1
+ vmla.f32 q6, q12, d1[0]
+ pld [r7, #128] // B
+ vmla.f32 q7, q12, d1[1]
+ vldr d24, [r5, #32]
+ vmla.f32 q8, q13, d0[0]
+ vldr d25, [r5, #40]
+ vmla.f32 q9, q13, d0[1]
+ vldr d0, [r7, #64]
+ vmla.f32 q10, q13, d1[0]
+ pld [r8, #64] // A1
+ vmla.f32 q11, q13, d1[1]
+ vldr d1, [r7, #72]
+
+ // unroll 1
+ vmla.f32 q4, q14, d2[0]
+ vldr d26, [r8, #32] // A1
+ vmla.f32 q5, q14, d2[1]
+ vldr d27, [r8, #40] // A1
+ vmla.f32 q6, q14, d3[0]
+ vmla.f32 q7, q14, d3[1]
+ vldr d28, [r5, #48]
+ vmla.f32 q8, q15, d2[0]
+ vldr d29, [r5, #56]
+ vmla.f32 q9, q15, d2[1]
+ vldr d2, [r7, #80]
+ vmla.f32 q10, q15, d3[0]
+ add r5, r5, #64
+ vmla.f32 q11, q15, d3[1]
+ vldr d3, [r7, #88]
+
+ // unroll 2
+ vmla.f32 q4, q12, d4[0]
+ vldr d30, [r8, #48] // A1
+ vmla.f32 q5, q12, d4[1]
+ vldr d31, [r8, #56] // A1
+ vmla.f32 q6, q12, d5[0]
+ add r7, r7, #64
+ vmla.f32 q7, q12, d5[1]
+ vldr d24, [r5, #0]
+ vmla.f32 q8, q13, d4[0]
+ vldr d25, [r5, #8]
+ vmla.f32 q9, q13, d4[1]
+ vldr d4, [r7, #32]
+ vmla.f32 q10, q13, d5[0]
+ add r8, r8, #64
+ vmla.f32 q11, q13, d5[1]
+ vldr d5, [r7, #40]
+
+ // unroll 3
+ vmla.f32 q4, q14, d6[0]
+ vldr d26, [r8, #0] // A1
+ vmla.f32 q5, q14, d6[1]
+ vldr d27, [r8, #8] // A1
+ vmla.f32 q6, q14, d7[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q14, d7[1]
+ vldr d28, [r5, #16]
+ vmla.f32 q8, q15, d6[0]
+ vldr d29, [r5, #24]
+ vmla.f32 q9, q15, d6[1]
+ vldr d6, [r7, #48]
+ vmla.f32 q10, q15, d7[0]
+ vmla.f32 q11, q15, d7[1]
+ vldr d7, [r7, #56]
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+
+ // unroll 0
+ vmla.f32 q4, q12, d0[0]
+ vldr d30, [r8, #16] // A1
+ vmla.f32 q5, q12, d0[1]
+ vldr d31, [r8, #24] // A1
+ vmla.f32 q6, q12, d1[0]
+ vmla.f32 q7, q12, d1[1]
+ vldr d24, [r5, #32]
+ vmla.f32 q8, q13, d0[0]
+ vldr d25, [r5, #40]
+ vmla.f32 q9, q13, d0[1]
+// vldr d4, [r7, #64]
+ vmla.f32 q10, q13, d1[0]
+ vmla.f32 q11, q13, d1[1]
+// vldr d5, [r7, #72]
+
+ // unroll 1
+ vmla.f32 q4, q14, d2[0]
+ vldr d26, [r8, #32] // A1
+ vmla.f32 q5, q14, d2[1]
+ vldr d27, [r8, #40] // A1
+ vmla.f32 q6, q14, d3[0]
+ vmla.f32 q7, q14, d3[1]
+ vldr d28, [r5, #48]
+ vmla.f32 q8, q15, d2[0]
+ vldr d29, [r5, #56]
+ vmla.f32 q9, q15, d2[1]
+// vldr d6, [r7, #80]
+ vmla.f32 q10, q15, d3[0]
+// add r5, r5, #64
+ vmla.f32 q11, q15, d3[1]
+// vldr d7, [r7, #88]
+
+ // unroll 2
+ vmla.f32 q4, q12, d4[0]
+ vldr d30, [r8, #48] // A1
+ vmla.f32 q5, q12, d4[1]
+ vldr d31, [r8, #56] // A1
+ vmla.f32 q6, q12, d5[0]
+// add r7, r7, #64
+ vmla.f32 q7, q12, d5[1]
+// vldr d24, [r5, #0]
+ vmla.f32 q8, q13, d4[0]
+// vldr d25, [r5, #8]
+ vmla.f32 q9, q13, d4[1]
+// vldr d4, [r7, #32]
+ vmla.f32 q10, q13, d5[0]
+// add r8, r8, #64
+ vmla.f32 q11, q13, d5[1]
+// vldr d5, [r7, #40]
+
+ // unroll 3
+ vmla.f32 q4, q14, d6[0]
+// vldr d26, [r8, #0] // A1
+ vmla.f32 q5, q14, d6[1]
+// vldr d27, [r8, #8] // A1
+ vmla.f32 q6, q14, d7[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q14, d7[1]
+// vldr d28, [r5, #16]
+ vmla.f32 q8, q15, d6[0]
+// vldr d29, [r5, #24]
+ vmla.f32 q9, q15, d6[1]
+// vldr d6, [r7, #48]
+ vmla.f32 q10, q15, d7[0]
+ vmla.f32 q11, q15, d7[1]
+// vldr d7, [r7, #56]
+
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+// sub r5, r5, #32 // A0
+// sub r7, r7, #32 // B
+// sub r8, r8, #16 // A1
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d4, d5}, [r7:128]! // B
+ vld1.64 {d0, d1}, [r5:128]! // A0
+ vmla.f32 q4, q0, d4[0]
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+ vld1.64 {d0, d1}, [r8:128]! // A1
+ vmla.f32 q8, q0, d4[0]
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr d4, [r6, #0] // B[0,1]
+ vldr d5, [r6, #16] // B[4,5]
+ vldr d6, [r6, #32] // B[8,9]
+ vldr d7, [r6, #48] // B[12,13]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, r7]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+// vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+// vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+// vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+// vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr s8, [r6, #0] // B[0]
+ vmla.f32 q4, q0, d4[0]
+ vldr s8, [r6, #16] // B[4]
+ vmla.f32 q5, q0, d4[0]
+ vldr s8, [r6, #32] // B[8]
+ vmla.f32 q6, q0, d4[0]
+ vldr s8, [r6, #48] // B[12]
+ vmla.f32 q7, q0, d4[0]
+
+ sub r4, r4, #1
+ add r6, r6, #4
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+// r7 <- 4*sdc*sizeof(float)
+//
+// output arguments:
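+//
+// computes acc <- alpha*acc and, when beta does not compare equal to 0.0,
+// acc <- acc + beta*C, reading C from two 4-row panels (at r6 and r6+r7);
+// the load of C is skipped entirely for beta == 0.0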
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ vmul.f32 q7, q7, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q8, q8, d4[0]
+ vmul.f32 q9, q9, d4[0]
+ vmul.f32 q10, q10, d4[0]
+ vmul.f32 q11, q11, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ add r8, r6, r7
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q8, q0, d4[1]
+ vmla.f32 q9, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q10, q0, d4[1]
+ vmla.f32 q11, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+// r5 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#endif
+#endif
+
+ add r6, r4, r5
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+ vst1.64 {d16, d17, d18, d19}, [r6:128]!
+ vst1.64 {d20, d21, d22, d23}, [r6:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// 16-byte zero constant, used to zero the accumulation registers
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
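+//
+// illustrative C caller (a documentation sketch only, not part of this file;
+// it assumes the lib4 panel-major layout used by BLASFEO, where element (i,j)
+// of an array X with panel stride sdx sits at X[(i/4)*4*sdx + j*4 + i%4]):
+//
+//	void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda,
+//		float *B, float *beta, float *C, int sdc, float *D, int sdd);
+//
+//	// hypothetical wrapper computing
+//	// D[0:8,0:4] = alpha * A[0:8,0:kmax] * B[0:4,0:kmax]^T + beta * C[0:8,0:4]
+//	void example_sgemm_nt_8x4(int kmax, float *A, int sda, float *B,
+//		float *C, int sdc, float *D, int sdd)
+//		{
+//		float alpha = 1.0f;
+//		float beta = 1.0f;
+//		kernel_sgemm_nt_8x4_lib4(kmax, &alpha, A, sda, B, &beta, C, sdc, D, sdd);
+//		}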
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_8x4_lib4
+ .type kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .global kernel_sgemm_nt_8x4_lib4
+_kernel_sgemm_nt_8x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+ vmov q8, q4
+ vmov q9, q4
+ vmov q10, q4
+ vmov q11, q4
+
+
+
+	// call inner kernel sgemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // sda
+ lsl r6, r6, #4 // 4*sizeof(float)*sda
+ ldr r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+ ldr r7, [fp, #12] // sdc
+ lsl r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+ ldr r5, [fp, #20] // sdd
+ lsl r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib4, .-kernel_sgemm_nt_8x4_lib4
+#endif
+
+
+
+#if 0
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nn_4x4_lib4
+ .type kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .global kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+	// call inner kernel sgemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+ ldr r7, [fp, #0] // sdb
+ lsl r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv8a/Makefile b/kernel/armv8a/Makefile
new file mode 100644
index 0000000..75e1faf
--- /dev/null
+++ b/kernel/armv8a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_16x4_lib4.o kernel_sgemm_12x4_lib4.o kernel_sgemm_8x8_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/armv8a/kernel_dgemm_4x4_lib4.S b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..2d43b10
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ add sp, sp, #-(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
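+
+// PROLOGUE/EPILOGUE spill and reload d8-d15 and x18-x30 in an 11*16-byte
+// stack frame; STACKSIZE is used by kernels with stack-passed arguments to
+// address them at [sp, #(STACKSIZE + offset)]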
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
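+//
+// accumulator layout (2 doubles per v register):
+// v0 <- D[0:1,0]   v2 <- D[0:1,1]   v4 <- D[0:1,2]   v6 <- D[0:1,3]
+// v1 <- D[2:3,0]   v3 <- D[2:3,1]   v5 <- D[2:3,2]   v7 <- D[2:3,3]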
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ // unroll 2
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 1
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+
+ .align 4
+ .global kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel dgemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_dgemm_8x4_lib4.S b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..314489d
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(double)
+// x11 <- B
+//
+// output arguments:
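+//
+// accumulator layout (2 doubles per v register): each column of the 8x4 block
+// is split in 2-row halves; v0..v7 hold rows 0-3 (v0=D[0:1,0], v1=D[2:3,0],
+// ..., v7=D[2:3,3]) and v8..v15 hold rows 4-7 in the same pattern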
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ldp d24, d25, [x11], #16
+ ldp d26, d27, [x11], #16
+ ldp q16, q17, [x9], #32
+ ldp q20, q21, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+// ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+// ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+// ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+// ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+// ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+// ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+// cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x11, x11, #32
+ sub x12, x12, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v20.2d, v21.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x11], #32
+ fmla v0.2d, v20.2d, v28.2d[0]
+ fmla v1.2d, v21.2d, v28.2d[0]
+ fmla v2.2d, v20.2d, v28.2d[1]
+ fmla v3.2d, v21.2d, v28.2d[1]
+ fmla v4.2d, v20.2d, v29.2d[0]
+ fmla v5.2d, v21.2d, v29.2d[0]
+ fmla v6.2d, v20.2d, v29.2d[1]
+ fmla v7.2d, v21.2d, v29.2d[1]
+ ld1 {v22.2d, v23.2d}, [x12], #32
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ fmla v10.2d, v22.2d, v28.2d[1]
+ fmla v11.2d, v23.2d, v28.2d[1]
+ fmla v12.2d, v22.2d, v29.2d[0]
+ fmla v13.2d, v23.2d, v29.2d[0]
+ fmla v14.2d, v22.2d, v29.2d[1]
+ fmla v15.2d, v23.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- 4*sdc*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+ fmul v8.2d, v8.2d, v28.2d[0]
+ fmul v9.2d, v9.2d, v28.2d[0]
+ fmul v10.2d, v10.2d, v28.2d[0]
+ fmul v11.2d, v11.2d, v28.2d[0]
+ fmul v12.2d, v12.2d, v28.2d[0]
+ fmul v13.2d, v13.2d, v28.2d[0]
+ fmul v14.2d, v14.2d, v28.2d[0]
+ fmul v15.2d, v15.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v8.2d, v24.2d, v28.2d[0]
+ fmla v9.2d, v25.2d, v28.2d[0]
+ fmla v10.2d, v26.2d, v28.2d[0]
+ fmla v11.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v12.2d, v24.2d, v28.2d[0]
+ fmla v13.2d, v25.2d, v28.2d[0]
+ fmla v14.2d, v26.2d, v28.2d[0]
+ fmla v15.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- 4*sdd*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+ st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x10], #64
+ st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_dgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+
+ .align 4
+ .global kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, %function
+kernel_dgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #5 // 32*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov w11, w7 // sdc
+ lsl w11, w11, #5 // 32*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #5 // 32*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_12x4_lib4.S b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..ab66cad
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(float)
+// x11 <- B
+//
+// output arguments:
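+//
+// accumulator layout (4 floats per v register): v0..v3 hold columns 0..3 of
+// rows 0-3, v4..v7 of rows 4-7 and v8..v11 of rows 8-11 of the 12x4 block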
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ ld1 {v16.4s, v17.4s}, [x13], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+ ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+ ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+// ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+// ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+// ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+ sub x13, x13, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x13], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_12X4_LIB4
+#else
+ .align 4
+ .type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_12x4_lib4
+ .type kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+ bl inner_scale_ab_12x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+ bl inner_store_12x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_16x4_lib4.S b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
new file mode 100644
index 0000000..edc06ac
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
@@ -0,0 +1,600 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- 4*sda*sizeof(float)
+// x11 <- B
+//
+// output arguments:
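+//
+// accumulator layout (4 floats per v register): v0..v3 hold columns 0..3 of
+// rows 0-3, v4..v7 of rows 4-7, v8..v11 of rows 8-11 and v12..v15 of
+// rows 12-15 of the 16x4 block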
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_16x4_lib4, %function
+inner_kernel_gemm_add_nt_16x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+ add x14, x13, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ldp s24, s25, [x11], #8
+ ldp s26, s27, [x11], #8
+ ldr q16, [x9], #16
+ ldr q17, [x12], #16
+ ldr q18, [x13], #16
+ ldr q19, [x14], #16
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ prfm PLDL1KEEP, [x13, #32]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ prfm PLDL1KEEP, [x14, #32]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #16
+ sub x11, x11, #16
+ sub x12, x12, #16
+ sub x13, x13, #16
+ sub x14, x14, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ // TODO
+ ldp s24, s25, [x11], #8
+ ldr q16, [x9], #16
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q17, [x12], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ ldr q19, [x14], #16
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_16x4_lib4, .-inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
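+//
+// editorial note: computes acc <- (*alpha) * acc + (*beta) * C for the 16x4
+// accumulator in v0-v15; alpha and beta are passed by address and only lane 0 of
+// the loaded vector is used, while C is read from four consecutive 4-row panels
+// separated by the byte stride in x11.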
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_16X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_16x4_lib4, %function
+inner_scale_ab_16x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+ add x14, x13, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x14], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_16x4_lib4, .-inner_scale_ab_16x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_16X4_LIB4
+#else
+ .align 4
+ .type inner_store_16x4_lib4, %function
+inner_store_16x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+ add x12, x11, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x12], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_16x4_lib4, .-inner_store_16x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_16x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
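+//
+// editorial note: operands are in BLASFEO panel-major (lib4) storage; sda/sdc/sdd
+// appear to be panel strides, i.e. the next 4-row panel starts 4*stride floats
+// (16*stride bytes) later, which matches the "lsl #4" scaling below. Overall the
+// kernel computes D[16x4] = alpha * A[16xk] * B[4xk]^T + beta * C[16x4].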
+
+ .align 4
+ .global kernel_sgemm_nt_16x4_lib4
+ .type kernel_sgemm_nt_16x4_lib4, %function
+kernel_sgemm_nt_16x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
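+	// (editorial note: on AArch64, writing d0 with fmov also zeroes the upper 64 bits
+	// of v0, so this sequence already clears the full 128-bit accumulators)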
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB4
+#else
+ bl inner_scale_ab_16x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB4
+#else
+ bl inner_store_16x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_4x4_lib4.S b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..6d3850d
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,354 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ add sp, sp, #-(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
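+//
+// editorial note: 4x4 micro-kernel; the accumulator is v0-v3 (one 4-float column
+// each) and every fmla multiplies a column of A by one lane of a B vector, so each
+// ld1 of B feeds four fmla instructions.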
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x10, #64]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ sub w8, w8, #4
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d}, [x9], #16
+ ld1 {v28.2d}, [x10], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+
+ .align 4
+ .global kernel_sgemm_nt_4x4_lib4
+ .type kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+
+
+
+	// call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_sgemm_8x4_lib4.S b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..016af72
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,433 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
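+//
+// editorial note: same scheme as the 4x4 kernel, but A is read from two 4-row
+// panels (x9 and x12 = x9 + panel stride), giving an 8x4 accumulator in v0-v7.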
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+// ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+// ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v16.4s, v17.4s}, [x13], #32
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_8x4_lib4
+ .type kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_8x8_lib4.S b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
new file mode 100644
index 0000000..6c8c090
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
@@ -0,0 +1,565 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+// x12 <- sdb
+//
+// output arguments:
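+//
+// editorial note: here both A and B span two 4-row panels (x9/x13 and x11/x14
+// respectively), so the accumulator covers the full 8x8 block in v0-v15.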
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x8_lib4, %function
+inner_kernel_gemm_add_nt_8x8_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x13, x9, x10
+ add x14, x11, x12
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x13], #32
+ ld1 {v16.4s, v17.4s}, [x14], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+ ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+ ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+// ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+// ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+// ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x13, x13, #32
+ sub x11, x11, #32
+ sub x14, x14, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X8_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x8_lib4, %function
+inner_scale_ab_8x8_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X8_LIB4
+#else
+ .align 4
+ .type inner_store_8x8_lib4, %function
+inner_store_8x8_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x8_lib4, .-inner_store_8x8_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
+// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
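+//
+// editorial note: the first eight arguments arrive in registers (AArch64 calling
+// convention, as laid out above); sdc, D and sdd are taken from the caller's stack,
+// which after the prologue sits at sp + STACKSIZE (+0, +8 and +16 respectively).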
+
+ .align 4
+ .global kernel_sgemm_nt_8x8_lib4
+ .type kernel_sgemm_nt_8x8_lib4, %function
+kernel_sgemm_nt_8x8_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+ mov w12, w5 // sdb
+ lsl w12, w12, #4 // 16*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x6 // beta
+ mov x10, x7 // C
+	ldr		w11, [sp, #(STACKSIZE + 0)] // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+ bl inner_scale_ab_8x8_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 8)] // D
+ ldr w9, [sp, #(STACKSIZE + 16)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB4
+#else
+ bl inner_store_8x8_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
+
diff --git a/kernel/avx/Makefile b/kernel/avx/Makefile
new file mode 100644
index 0000000..f260086
--- /dev/null
+++ b/kernel/avx/Makefile
@@ -0,0 +1,54 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_12_lib4.o kernel_dgemv_8_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o kernel_dgebp_lib4.o
+OBJS += kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgetr_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/avx/kernel_dgebp_lib4.S b/kernel/avx/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..0e8581e
--- /dev/null
+++ b/kernel/avx/kernel_dgebp_lib4.S
@@ -0,0 +1,935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+//                                    1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
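+//
+// editorial note (sketch of the operation): this appears to perform a rank-4 downdate
+// of the form C(8 x k) -= A(8 x 4) * B(4 x k); the 8x4 block of A is kept resident in
+// ymm0-ymm7 (top/bottom 4-row panels) and each column of C is updated with four
+// broadcast multiplies from the corresponding column of B.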
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_lib4
+ .type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_lib4
+ .def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
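+//
+// editorial note: the _vs variant behaves like kernel_dger4_sub_8r_lib4 above, but
+// builds a lane mask derived from km (via the .LC01 constant defined elsewhere in
+// this file) and loads the bottom panel of A with vmaskmovpd, so rows beyond km do
+// not contribute to the update.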
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC01(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmaskmovpd 0(%r11, %r12, 1), %ymm15, %ymm4
+ vmaskmovpd 32(%r11, %r12, 1), %ymm15, %ymm5
+ vmaskmovpd 64(%r11, %r12, 1), %ymm15, %ymm6
+ vmaskmovpd 96(%r11, %r12, 1), %ymm15, %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
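+//
+// For reference, a scalar C sketch of the operation (hypothetical helper,
+// not part of the BLASFEO API), assuming all three operands are stored as
+// 4-row column-major panels (bs=4, leading dimension 4):
+//
+//   void kernel_dger4_sub_4r_ref(int n, const double *A, const double *B,
+//                                double *C)
+//   {
+//       // C(4 x n) -= A(4 x 4) * B(4 x n), one column of C at a time
+//       for (int j = 0; j < n; j++)
+//           for (int k = 0; k < 4; k++)
+//               for (int i = 0; i < 4; i++)
+//                   C[i + 4*j] -= A[i + 4*k] * B[k + 4*j];
+//   }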
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_lib4
+ .type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_lib4
+ .def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
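+//
+// Variable-size variant: km (at most 4) is the number of active rows of the
+// 4-row block.  The kernel masks the load of A so that rows km..3 contribute
+// zero, which has the same effect as this scalar sketch (hypothetical helper,
+// not part of the BLASFEO API):
+//
+//   void kernel_dger4_sub_4r_vs_ref(int n, const double *A, const double *B,
+//                                   double *C, int km)
+//   {
+//       for (int j = 0; j < n; j++)
+//           for (int k = 0; k < 4; k++)
+//               for (int i = 0; i < km; i++)
+//                   C[i + 4*j] -= A[i + 4*k] * B[k + 4*j];
+//   }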
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+ movq ARG5, %r14
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC00(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
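+	// lane i of the mask now holds (i+0.5)-km, which is negative (sign bit
+	// set, so vmaskmovpd loads that lane) exactly when i < km; the rows of A
+	// at index km and above are read as 0.0 below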
+
+ // load block from A
+ vmaskmovpd 0(%r11), %ymm15, %ymm0
+ vmaskmovpd 32(%r11), %ymm15, %ymm1
+ vmaskmovpd 64(%r11), %ymm15, %ymm2
+ vmaskmovpd 96(%r11), %ymm15, %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+ .align 5
+#endif
+ .double 0.5
+ .double 1.5
+ .double 2.5
+ .double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double 4.5
+ .double 5.5
+ .double 6.5
+ .double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 8.5
+ .double 9.5
+ .double 10.5
+ .double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_dgemm_4x4_lib4.S b/kernel/avx/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..95ff6ea
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9906 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
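+
+// Note on the two calling conventions above: on the System V ABI (OS_LINUX,
+// OS_MAC) the first six integer/pointer arguments arrive in rdi, rsi, rdx,
+// rcx, r8, r9 and all vector registers are caller-saved, so the prologue only
+// spills the callee-saved general-purpose registers.  On Windows x64 only
+// four arguments arrive in registers (rcx, rdx, r8, r9), the caller reserves
+// a 32-byte shadow space (hence the 40-byte offset of ARG5 and up), and
+// xmm6-xmm15 as well as rdi/rsi are callee-saved, so they are spilled too.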
+
+
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
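+//
+// In scalar terms this routine accumulates D += A * B^T for one 4x4 block,
+// k steps deep, with A and B stored as 4-row panels (bs=4).  A reference
+// sketch (hypothetical helper, not part of the BLASFEO API):
+//
+//   void inner_kernel_dgemm_add_nt_4x4_ref(int k, const double *A,
+//                                          const double *B, double D[4][4])
+//   {
+//       for (int kk = 0; kk < k; kk++)
+//           for (int j = 0; j < 4; j++)
+//               for (int i = 0; i < 4; i++)
+//                   D[i][j] += A[i + 4*kk] * B[j + 4*kk];
+//   }
+//
+// The assembly keeps the accumulators in the shuffled layout documented above
+// (ymm0 holds d00 d11 d22 d33, and so on); the permutation is undone before
+// the result is scaled and stored.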
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+
+// cmpl $3, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ subl $1, %r10d
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
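+//
+// Same contract as inner_kernel_dgemm_add_nt_4x4_lib4 above, with every
+// vaddpd replaced by vsubpd: it accumulates D -= A * B^T into the same
+// shuffled accumulator layout.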
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ cmpl $4, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+// cmpl $3, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
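+//
+// Scalar sketch of the nn variant (hypothetical helper, not part of the
+// BLASFEO API): A is a 4-row panel, while B is stored in row panels of
+// height 4 with panel stride sdb, so element B(kk,j) lives at
+// B[(kk/4)*4*sdb + kk%4 + 4*j]:
+//
+//   void inner_kernel_dgemm_add_nn_4x4_ref(int k, const double *A,
+//                                          const double *B, int sdb,
+//                                          double D[4][4])
+//   {
+//       for (int kk = 0; kk < k; kk++)
+//           for (int j = 0; j < 4; j++)
+//               for (int i = 0; i < 4; i++)
+//                   D[i][j] += A[i + 4*kk] * B[(kk/4)*4*sdb + kk%4 + 4*j];
+//   }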
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
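+//
+// Same contract and B addressing as inner_kernel_dgemm_add_nn_4x4_lib4 above,
+// with the accumulation sign flipped: D -= A * B.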
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- B+4*k*sizeof(double)
+// r12 <- C+4*k*sizeof(double)
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
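+//
+// Additive counterpart of kernel_dger4_sub_4r_lib4: with the 4x4 block A
+// already held in ymm0-ymm3 (one column per register), it updates
+// C(4 x k) += A(4 x 4) * B(4 x k), one 4-element column of C per iteration,
+// four columns per pass of the main loop.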
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 32(%r12)
+
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 64(%r12)
+
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
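+//
+// The edge consumes kend = min(k, 4-offB) k-iterations one at a time,
+// starting offB doubles into the current row panel of B, and then moves B to
+// the start of the next row panel.  For example, with offB = 3 and k >= 1 it
+// performs a single rank-1 update, reading row 3 of each of the four columns
+// of the current B panel, before jumping to the next panel.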
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
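+//
+// With B upper triangular, B(j,kk) = 0 for j > kk, so the first four
+// k-iterations of D += A * B^T only touch the leading columns.  Scalar sketch
+// of this edge (hypothetical, not part of the BLASFEO API):
+//
+//   // first 4 k-iterations, B upper triangular, 4-row panels (bs=4)
+//   for (int kk = 0; kk < 4; kk++)
+//       for (int j = 0; j <= kk; j++)
+//           for (int i = 0; i < 4; i++)
+//               D[i][j] += A[i + 4*kk] * B[j + 4*kk];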
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
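+//
+// Same triangular edge as inner_edge_dtrmm_nt_ru_4x4_lib4 above, but k is
+// decremented and checked after every step, so it also handles blocks with
+// fewer than 4 remaining k-iterations.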
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
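+//
+// With B lower triangular, B(kk,j) = 0 for kk < j, so k-iteration kk of
+// D += A * B only updates columns 0..kk.  Ignoring the panel wrap introduced
+// by offB, the offB==0 branch is equivalent to this scalar sketch
+// (hypothetical, not part of the BLASFEO API):
+//
+//   // first 4 k-iterations, B lower triangular, 4-row panels (bs=4)
+//   for (int kk = 0; kk < 4; kk++)
+//       for (int j = 0; j <= kk; j++)
+//           for (int i = 0; i < 4; i++)
+//               D[i][j] += A[i + 4*kk] * B[kk + 4*j];
+//
+// For offB > 0 the triangle starts offB rows down the current B panel, so the
+// branches below first skip those offB entries, work through the rows left in
+// that panel, and then wrap B to the start of the next row panel.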
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
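+//
+// Generalized version of inner_edge_dtrmm_nn_rl_4x4_lib4 above: the same offB
+// cases, but k is checked as the steps proceed, so the edge can stop early
+// when only a few k-iterations remain.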
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r14d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r14d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r14d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
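+// reference sketch (C, comments only, not compiled) of this dlauum edge, with
+// acc[j][i] standing for ymm0..ymm3 and A, B the 4x4 panel-major blocks at r10/r11.
+// Only the upper triangles take part: the vblendpd against a zeroed register masks
+// out the rows of each A column below the diagonal, and only B entries on or above
+// the diagonal are ever broadcast:
+//
+//   for(l=0; l<4; l++)
+//       for(j=0; j<=l; j++)
+//           for(i=0; i<=l; i++)
+//               acc[j][i] += A[i+4*l] * B[j+4*l];
+//
+// i.e. the 4x4 diagonal block of triu(A) * triu(B)^T is accumulated on top of
+// whatever is already in the accumulators.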
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r11
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
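+// reference sketch (comments only): the 'nt' accumulation loops keep B rotated
+// inside the register instead of broadcasting it, so each accumulator holds one
+// element of every column (ymm0 carries the diagonal d00,d11,d22,d33; see the
+// layouts listed above).  The two rounds of vblendpd below only permute data:
+//
+//   // out[j][i] = d(i,j);  in[r][lane] = the permuted layout documented above
+//   for(r=0; r<4; r++)
+//       for(lane=0; lane<4; lane++)
+//           out[col_of[r][lane]][lane] = in[r][lane];
+//
+// no arithmetic happens here; the row index is always the SIMD lane, only the
+// column assignment of each lane changes.
+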
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
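+// reference sketch (C, comments only, not compiled) of this scaling step, with
+// acc[j][i] standing for ymm0..ymm3 and C the 4x4 panel-major block at r12
+// (element (i,j) at C[i+4*j]):
+//
+//   for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//           acc[j][i] = alpha[0]*acc[j][i] + beta[0]*C[i+4*j];
+//
+// the beta==0.0 test below skips the loads of C altogether, matching the usual
+// BLAS convention that C is not read when beta is zero.
+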
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
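+// reference sketch (C, comments only, not compiled): same alpha/beta combination
+// as the aligned routine above, except that C starts at row 'offset' (0..3) inside
+// its 4-row panel, so each column is gathered from two consecutive panels
+// C0 = r13 and C1 = C0 + 4*sdc*sizeof(double):
+//
+//   // C_elem(i,j): r = offset+i; return r<4 ? C0[r+4*j] : C1[(r-4)+4*j];
+//   for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//           acc[j][i] = alpha[0]*acc[j][i] + beta[0]*C_elem(i,j);
+//
+// the offset==1/2/3 branches below implement the gather with vblendpd plus a
+// 128-bit permute and an in-lane shuffle instead of scalar loads.
+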
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
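+// reference sketch (C, comments only, not compiled) of the in-register 4x4
+// factorization, with acc[j][i] standing for ymm0..ymm3 (lower triangle of the
+// block) and inv_diag_E the output array at r10:
+//
+//   for(j=0; j<4; j++) {
+//       double d = acc[j][j];
+//       double inv = d>0.0 ? 1.0/sqrt(d) : 0.0;      // non-positive pivot: column is zeroed
+//       inv_diag_E[j] = inv;
+//       for(i=0; i<4; i++) acc[j][i] *= inv;
+//       for(jj=j+1; jj<4; jj++)                      // rank-1 update of the trailing columns
+//           for(i=0; i<4; i++)
+//               acc[jj][i] -= acc[j][i] * acc[j][jj];
+//   }
+//
+// the vperm2f128/vpermilpd pairs below only broadcast acc[j][jj] across a register.
+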
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_lib4, @function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
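+// reference sketch (C, comments only, not compiled): solve X * E^T = acc in place,
+// E lower triangular, 1.0/E(j,j) precomputed in inv_diag_E; acc[j][i] stands for
+// ymm0..ymm3 and E is panel-major (element (i,j) at E[i+4*j]):
+//
+//   for(j=0; j<4; j++) {
+//       for(i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//       for(jj=j+1; jj<4; jj++)
+//           for(i=0; i<4; i++)
+//               acc[jj][i] -= acc[j][i] * E[jj+4*j];   // E(jj,j)
+//   }
+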
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
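+// reference sketch (C, comments only, not compiled): same right/lower/transposed
+// substitution as the inv-diagonal variant above, but with an implicit unit
+// diagonal, so the scaling by an inverse diagonal disappears:
+//
+//   for(j=0; j<4; j++)
+//       for(jj=j+1; jj<4; jj++)
+//           for(i=0; i<4; i++)
+//               acc[jj][i] -= acc[j][i] * E[jj+4*j];   // E(jj,j), diag(E) taken as 1.0
+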
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
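+// reference sketch (C, comments only, not compiled): solve X * E^T = acc in place,
+// E upper triangular with 1.0/E(j,j) precomputed in inv_diag_E, so the substitution
+// runs backwards over the columns:
+//
+//   for(j=3; j>=0; j--) {
+//       for(i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//       for(jj=j-1; jj>=0; jj--)
+//           for(i=0; i<4; i++)
+//               acc[jj][i] -= acc[j][i] * E[jj+4*j];   // E(jj,j), panel-major
+//   }
+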
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
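+// reference sketch (C, comments only, not compiled): solve X * E = acc in place,
+// E upper triangular (not transposed), 1.0/E(j,j) precomputed in inv_diag_E;
+// each column only depends on the columns already solved to its left:
+//
+//   for(j=0; j<4; j++) {
+//       for(l=0; l<j; l++)
+//           for(i=0; i<4; i++)
+//               acc[j][i] -= acc[l][i] * E[l+4*j];     // E(l,j), panel-major
+//       for(i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//   }
+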
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
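+// reference sketch (C, comments only, not compiled): solve E * X = acc in place,
+// E unit lower triangular, i.e. a forward substitution over the rows; the vblendpd
+// against a zeroed register keeps only the strictly lower part of each E column:
+//
+//   for(r=0; r<3; r++)                                // last column of E has no strictly lower part
+//       for(j=0; j<4; j++)
+//           for(i=r+1; i<4; i++)
+//               acc[j][i] -= E[i+4*r] * acc[j][r];    // E(i,r)
+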
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
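+//
+// reference only (illustrative, not assembled): scalar C sketch of this backward
+// substitution, assuming the lib4 panel-major layout (element (i,j) at i+4*j);
+// E is upper triangular and inv_diag_E holds the reciprocals of its diagonal
+//
+// static void ref_trsm_lun_inv_4x4(const double *E, const double *inv_diag_E, double *D)
+// 	{
+// 	int i, j, k;
+// 	for(j=0; j<4; j++)
+// 		for(k=3; k>=0; k--)
+// 			{
+// 			D[k+4*j] *= inv_diag_E[k]; // divide by the diagonal via its reciprocal
+// 			for(i=0; i<k; i++)
+// 				D[i+4*j] -= E[i+4*k] * D[k+4*j];
+// 			}
+// 	}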
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
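+//
+// note: compared to the fixed-size variant above, km skips the substitution
+// steps for rows that are not needed: the step using column 3 of E runs only if
+// km>3, the column-2 step only if km>2, the column-1 step only if km>1, while
+// the scaling of row 0 by inv_diag_E[0] is always performed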
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
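+//
+// reference only (illustrative, not assembled): scalar C sketch of the
+// factorization performed here, assuming the lib4 panel-major layout (element
+// (i,j) at i+4*j); the 4x4 block is overwritten with L\U (unit lower L below
+// the diagonal, U on and above it) and the reciprocals of the pivots go to
+// inv_diag_E
+//
+// static void ref_getrf_4x4(double *D, double *inv_diag_E)
+// 	{
+// 	int i, j, k;
+// 	double inv;
+// 	for(j=0; j<4; j++)
+// 		{
+// 		for(k=0; k<j; k++) // update column j with the L columns to its left
+// 			for(i=k+1; i<4; i++)
+// 				D[i+4*j] -= D[i+4*k] * D[k+4*j];
+// 		inv = 1.0/D[j+4*j];
+// 		inv_diag_E[j] = inv;
+// 		for(i=j+1; i<4; i++)
+// 			D[i+4*j] *= inv; // scale the L part of column j
+// 		}
+// 	}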
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+ vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vmovddup %xmm0, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vmovddup %xmm1, %xmm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vmovddup %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
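+//
+// note: the row mask below is built by broadcasting km (converted to double)
+// and subtracting it from the per-lane constant LC02 (defined elsewhere in this
+// file); lanes whose LC02 entry is below km get their sign bit set and are
+// written by vmaskmovpd, while kn selects how many columns are stored; scalar
+// equivalent, assuming the lib4 layout and kn>=1:
+//
+// for(j=0; j<kn && j<4; j++)
+// 	for(i=0; i<km && i<4; i++)
+// 		D[i+4*j] = acc[j][i]; // acc[j][i] = lane i of ymm0..ymm3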
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r12d
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ jl 0f // end
+ cmpl $3, %r12d
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ jl 0f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
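+//
+// note: offset selects by how many rows the 4x4 block is shifted inside its
+// panel; for offset>0 the rows are rotated and split across two consecutive
+// panels D0=r11 and D1=r11+r12 (r12 = 4*sdd*sizeof(double)), each with its own
+// store mask, while rows outside [m0,m1) and columns outside [n0,n1) are
+// masked out or skipped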
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vblendpd $0x4, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x2, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
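+//
+// reference only (illustrative, not assembled): scalar semantics of this kernel,
+// assuming the lib4 panel-major layout where A and B are 4 x k blocks with
+// element (i,l) at i+4*l, C and D are 4x4 blocks with element (i,j) at i+4*j,
+// and alpha and beta are passed by pointer
+//
+// for(i=0; i<4; i++)
+// 	for(j=0; j<4; j++)
+// 		{
+// 		acc = 0.0;
+// 		for(l=0; l<k; l++)
+// 			acc += A[i+4*l] * B[j+4*l]; // A * B^T ("nt")
+// 		D[i+4*j] = alpha[0]*acc + beta[0]*C[i+4*j];
+// 		}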
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
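+//
+// note: same computation as kernel_dgemm_nt_4x4_lib4 above, but only the first
+// km rows and the first kn columns of the 4x4 result block are stored to D
+// (used for partial blocks at the border of the output matrix)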
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
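+//
+// note: the _gen variant additionally lets C and D start at a row offset inside
+// their panels (offsetC, offsetD, with panel strides sdc and sdd) and writes
+// only the rows in [m0,m1) and the columns in [n0,n1) of the 4x4 block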
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
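+//
+// note: this kernel first subtracts A*B^T (k columns, lib4 layout) from the 4x4
+// block C, then factorizes the result: D receives the lower triangular Cholesky
+// factor and inv_diag_D the reciprocals of its diagonal entries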
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
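+	// (left shift by 5 multiplies by 32 = 4 rows per panel * sizeof(double),
+	//  giving the byte offset between consecutive 4-row panels of B)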
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+ movq ARG9, %r12 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
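+// (applies the block of 4 Householder reflectors encoded in pV (reflector vectors)
+//  and pT (4x4 triangular factor) to 4 rows of pD from the right, in the spirit of
+//  LAPACK dlarfb)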
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_4_lib4
+ .type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_4_lib4
+ .def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+	// D column 0
+ vmovapd 0(%r11), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm0
+	// D column 1
+ vmovapd 32(%r11), %ymm12
+ vaddpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+	// D column 2
+ vmovapd 64(%r11), %ymm12
+ vaddpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+	// D column 3
+ vmovapd 96(%r11), %ymm12
+ vaddpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ movq ARG3, %r10 // T
+
+	// row 3 of T
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+	// row 2 of T
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+	// row 1 of T
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+	// row 0 of T
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+
+	// update D column 0
+ vmovapd 0(%r12), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm12
+ vmovapd %ymm12, 0(%r12)
+	// update D column 1
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm1, %ymm12
+ vmovapd %ymm12, 32(%r12)
+	// update D column 2
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm2, %ymm12
+ vmovapd %ymm12, 64(%r12)
+	// update D column 3
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm3, %ymm12
+ vmovapd %ymm12, 96(%r12)
+
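+	// the remaining kmax-4 columns of D are updated by the dgebp kernel below,
+	// reusing the 4x4 block kept in ymm0..ymm3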
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
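+// (each double above is stored as two .long words, low half first: e.g.
+//  0.5 = 0x3FE0000000000000, i.e. .long 0 then .long 1071644672 = 0x3FE00000;
+//  the tables below use the same encoding)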
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_8x4_lib4.S b/kernel/avx/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..e9f1f34
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,13154 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
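+
+// Note on the two ABI blocks above (standard calling-convention facts, stated here for
+// reference): on Linux/Mac (SysV AMD64) the first six integer arguments arrive in
+// rdi, rsi, rdx, rcx, r8, r9 and only rbx, rbp, r12-r15 must be preserved, so the
+// prologue saves just those. On Windows (Microsoft x64) only rcx, rdx, r8, r9 carry
+// register arguments, the caller reserves a 32-byte shadow space (hence ARG5 starting
+// at STACKSIZE+40), and rdi, rsi and xmm6-xmm15 are callee-saved as well, which is why
+// the Windows prologue/epilogue additionally spills and restores those registers.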
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
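+
+// Reference model (comment only, not part of the build): a plain-C sketch of the update
+// this inner routine performs, assuming the lib4 panel-major layout (4-row panels, the
+// 4 elements of one panel column stored contiguously). Array names and indexing below
+// are illustrative; the actual accumulators keep the permuted [d00 d11 d22 d33] layout
+// documented above and are unscrambled later by the blend/store routines in this file.
+//
+//   // A0 = upper 4 x k panel, A1 = lower 4 x k panel, B = 4 x k panel, D = 8 x 4
+//   for (int l = 0; l < k; l++)
+//       for (int j = 0; j < 4; j++) {
+//           for (int i = 0; i < 4; i++) D[i][j]   += A0[i + 4*l] * B[j + 4*l];
+//           for (int i = 0; i < 4; i++) D[i+4][j] += A1[i + 4*l] * B[j + 4*l];
+//       }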
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+// movq %r11, %r15 // A1 <- A0
+// addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r11
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+// addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+// cmpl $3, %r10d
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// addq $32, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
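+
+// Same contract as the add_nt routine above; the only difference is the sign of the
+// update (vsubpd instead of vaddpd), i.e. in the reference sketch the accumulation
+// becomes D[i][j] -= A0[i + 4*l] * B[j + 4*l] (and likewise for the lower panel).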
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ cmpl $4, %r10d
+
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// cmpl $3, %r10d
+
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11  <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
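+
+// Reference model (comment only, not part of the build): plain-C sketch of the "nn"
+// update under the same lib4 assumptions as above. B is not transposed here, so one
+// scalar B(l,j) is broadcast per multiply pair; the 0/32/64/96 byte offsets below select
+// the 4 columns inside the current B panel (one panel column = 4 doubles = 32 bytes) and
+// r14 advances B to the next 4-row panel. Indexing is illustrative only.
+//
+//   // A0, A1 = 4 x k panels, B = k x 4 in lib4 layout with panel stride sdb, D = 8 x 4
+//   for (int l = 0; l < k; l++)
+//       for (int j = 0; j < 4; j++) {
+//           double b = B[(l/4)*4*sdb + 4*j + l%4]; // element (l, j) of B
+//           for (int i = 0; i < 4; i++) D[i][j]   += A0[i + 4*l] * b;
+//           for (int i = 0; i < 4; i++) D[i+4][j] += A1[i + 4*l] * b;
+//       }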
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11  <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
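+
+// Same contract as the add_nn routine above; only the sign of the update differs
+// (vsubpd instead of vaddpd), i.e. D[i][j] -= A0[i + 4*l] * b in the reference sketch.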
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
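+
+// Reference model (comment only, not part of the build): the 4x8 "nn" variant keeps a
+// single 4-row panel of A and walks 8 columns of B, so ymm0..ymm7 hold the 8 columns of
+// the 4 x 8 result. Illustrative C sketch, same lib4 indexing assumptions as above:
+//
+//   for (int l = 0; l < k; l++)
+//       for (int j = 0; j < 8; j++) {
+//           double b = B[(l/4)*4*sdb + 4*j + l%4];
+//           for (int i = 0; i < 4; i++) D[i][j] += A[i + 4*l] * b;
+//       }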
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
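+
+// Reference model (comment only, not part of the build): this "gebp" routine keeps the
+// whole 8 x 4 block of A in ymm0..ymm7 and streams over the columns of B and C, i.e. it
+// performs a rank-4 update of C. Illustrative C sketch, where n is the column count
+// passed in r10d and C0/C1 are the upper/lower 4-row panels of C (panel stride sdc):
+//
+//   for (int j = 0; j < n; j++)
+//       for (int l = 0; l < 4; l++) {
+//           for (int i = 0; i < 4; i++) C0[i + 4*j] += A0[i + 4*l] * B[l + 4*j];
+//           for (int i = 0; i < 4; i++) C1[i + 4*j] += A1[i + 4*l] * B[l + 4*j];
+//       }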
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
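+
+// Reference model (comment only, not part of the build): when B starts offB rows into
+// its 4-row panel, this edge consumes the first kend = min(k, 4-offB) iterations one at
+// a time so that the remaining B data is panel-aligned for the main kernel. Sketch of
+// the pointer bookkeeping only (offsets in doubles, illustrative):
+//
+//   int kend = 4 - offB; if (kend > k) kend = k;
+//   B += offB;                        // skip into the current panel
+//   for (int l = 0; l < kend; l++) {  // one rank-1 update of the 8x4 block per step
+//       /* D += A(:,0) * B_row;  A += 4;  B += 1; */
+//   }
+//   k -= kend;
+//   if (k > 0) B += 4*sdb - 4;        // realign B to the start of its next panel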
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+	addq	$32, %r11 // A0+1*bs*sizeof(double)
+	addq	$8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
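+
+// Illustrative scalar reference (comment only, not part of the build): the
+// same unaligned-B peel as the 8x4 edge above, but with a single 4-row panel
+// of A and eight B columns, one accumulator register per column of D:
+//
+//   int kend = k < 4-offB ? k : 4-offB;
+//   for (int l = 0; l < kend; l++)
+//       for (int j = 0; j < 8; j++)
+//           for (int i = 0; i < 4; i++)
+//               D[i+4*j] += A[i+4*l] * B[(offB+l)+4*j]; // column j -> ymm'j'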
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B (keep A in %ymm12)
+	vmulpd	%ymm12, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vmulpd	%ymm12, %ymm13, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vmulpd	%ymm12, %ymm13, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vmulpd	%ymm12, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
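+
+// Illustrative scalar reference (comment only, not part of the build): B is
+// upper triangular and used transposed (D += A * B'), so the first four
+// k-iterations only touch the columns j <= l of the leading 4x4 triangle;
+// with A0/A1 the two 4-row panels of A and D0/D1 the accumulators (names are
+// hypothetical, as in the sketches above):
+//
+//   for (int l = 0; l < 4; l++)
+//       for (int j = 0; j <= l; j++)
+//           for (int i = 0; i < 4; i++) {
+//               D0[i+4*j] += A0[i+4*l] * B[j+4*l]; // B[j+4*l] == B(j,l), j <= l
+//               D1[i+4*j] += A1[i+4*l] * B[j+4*l];
+//           }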
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 96(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
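+
+// Illustrative scalar reference (comment only, not part of the build): same
+// triangular edge as the fixed-size routine above, but k may be smaller than
+// 4, so each peeled iteration is guarded by a test on the remaining k:
+//
+//   for (int l = 0; l < 4 && l < k; l++)
+//       for (int j = 0; j <= l; j++)
+//           for (int i = 0; i < 4; i++) {
+//               D0[i+4*j] += A0[i+4*l] * B[j+4*l];
+//               D1[i+4*j] += A1[i+4*l] * B[j+4*l];
+//           }
+//
+// leaving r10d at max(k-4, 0) as documented above.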
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
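+
+// Illustrative scalar reference (comment only, not part of the build): here B
+// is lower triangular and not transposed (D += A * B), so k-iteration l only
+// touches columns j <= l; the offB branches below restart that pattern at row
+// offB of the current B panel. For the offB==0 case:
+//
+//   for (int l = 0; l < 4; l++)
+//       for (int j = 0; j <= l; j++)
+//           for (int i = 0; i < 4; i++) {
+//               D0[i+4*j] += A0[i+4*l] * B[l+4*j]; // B[l+4*j] == B(l,j), j <= l
+//               D1[i+4*j] += A1[i+4*l] * B[l+4*j];
+//           }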
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
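+
+// Illustrative scalar reference (comment only, not part of the build): same
+// lower-triangular edge as inner_edge_dtrmm_nn_rl_8x4_lib4 above, except that
+// k may run out before the triangle is finished, so every peeled step first
+// checks the remaining k; for the offB==0 case:
+//
+//   for (int l = 0; l < 4 && l < k; l++)
+//       for (int j = 0; j <= l; j++)
+//           for (int i = 0; i < 4; i++) {
+//               D0[i+4*j] += A0[i+4*l] * B[l+4*j];
+//               D1[i+4*j] += A1[i+4*l] * B[l+4*j];
+//           }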
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
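+
+// Illustrative note (comment only, not part of the build): the NT main loop
+// accumulates with B rotated inside each 128-bit lane, so every accumulator
+// holds one element of each output column, always in its "own" lane; the two
+// rounds of vblendpd below therefore regroup the columns without any
+// cross-lane move. Element-wise C model of one vblendpd (AT&T operand order:
+// imm, src2, src1, dst):
+//
+//   void blendpd4(double dst[4], const double src1[4], const double src2[4], int imm) {
+//       for (int i = 0; i < 4; i++)
+//           dst[i] = (imm >> i) & 1 ? src2[i] : src1[i];
+//   }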
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
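+
+// Illustrative scalar reference (comment only, not part of the build): alpha
+// and beta are both 1.0 here, so the accumulators are simply summed with the
+// two 4-row panels of C (hypothetical names C0/C1, with C1 starting r11 bytes
+// after C0):
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = 0; i < 4; i++) {
+//           D0[i+4*j] += C0[i+4*j]; // rows 0..3, ymm0..3
+//           D1[i+4*j] += C1[i+4*j]; // rows 4..7, ymm4..7
+//       }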
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
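+
+// Illustrative scalar reference (comment only, not part of the build): generic
+// scaling with an explicit beta==0.0 shortcut, so C is never read when beta is
+// zero:
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = 0; i < 4; i++) {
+//           D0[i+4*j] *= alpha;
+//           D1[i+4*j] *= alpha;
+//           if (beta != 0.0) {
+//               D0[i+4*j] += beta * C0[i+4*j]; // C0 = C, rows 0..3
+//               D1[i+4*j] += beta * C1[i+4*j]; // C1 = C + r13 bytes, rows 4..7
+//           }
+//       }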
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
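+
+// Illustrative scalar reference (comment only, not part of the build): when
+// the C block starts "offset" rows into its panel, each 4-row column needed
+// here straddles two consecutive C panels; the vblendpd/vperm2f128/vshufpd
+// sequences below implement the gather that a scalar loop would write, with a
+// hypothetical helper, as
+//
+//   double C_at(const double *C, int sdc, int offset, int i, int j) {
+//       int r = offset + i;                   // row in the panel grid
+//       return C[(r/4)*4*sdc + (r%4) + 4*j];  // next panel, row in panel, column
+//   }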
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
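+//
+// note: each 4x4 half (ymm0..ymm3 and ymm4..ymm7) is first transposed in registers with
+// vunpcklpd/vunpckhpd followed by vperm2f128, then scaled by alpha; beta*C is added only
+// when beta is nonzero, with C read as eight consecutive 4-double columns from r12.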
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm0, %ymm1, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm2, %ymm3, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm4, %ymm5, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm6, %ymm7, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
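+//
+// note: the vblendpd pairs below undo the shuffled accumulator ordering documented above
+// and restore plain column order before scaling by alpha. For beta!=0.0 the row offset in
+// r12d (0..3) selects how C is gathered: offset 0 reads the two 4-row panels directly,
+// while offsets 1..3 read up to three panels (r13, r13+r14, r13+2*r14) and realign the
+// rows with vblendpd/vperm2f128/vshufpd before the beta*C update.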
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
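+//
+// note: with alpha=1.0 and beta=1.0 no multiply is needed: the accumulators are only
+// blended back into column order and C is added directly, the upper 4-row panel read at
+// r10 and the lower one at r10 + 4*sdc*sizeof(double).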
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ // alg==1
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
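+//
+// note: factorizes the 4x4 diagonal block held in ymm0..ymm3 (lower Cholesky) and scales
+// the sub-diagonal 4x4 block in ymm4..ymm7 accordingly: for each pivot d_jj the
+// reciprocal 1.0/sqrt(d_jj) is stored in inv_diag_E[j] and used to scale column j, after
+// which the remaining columns are down-dated. A non-positive pivot is replaced by 0.0
+// (out-of-line branches 1/3/5/7), which zeroes the corresponding column instead of
+// taking the square root of a non-positive number.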
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_lib4, @function
+inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_lib4, .-inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
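+//
+// note: variable-size variant of the routine above: after each factorized column kn
+// (r11d) is checked and the routine returns early, so only the first kn columns are
+// processed when the block is narrower than 4.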
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
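+//
+// note: solves D <- D * E^{-T} with E 4x4 lower triangular, one column at a time:
+// column j (held in ymm_j and ymm_(j+4)) is scaled by the precomputed reciprocal
+// inv_diag_E[j], then e_ij times column j is subtracted from every later column i > j;
+// the e_ij are broadcast from the packed 4x4 panel of E, element (i,j) at byte offset
+// 8*(4*j+i).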
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
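+//
+// note: same forward column sweep as the routine above, but kn (r12d) is checked after
+// each column so that only the first kn columns are solved in the variable-size case.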
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
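+//
+// note: with a unit diagonal there is no per-column scaling; only the off-diagonal
+// updates are applied, subtracting e_ij times column j (broadcast from the packed panel
+// of E) from the later columns i > j.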
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
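+//
+// note: kn (r11d) gates how many columns are updated. The cmpl for the next column is
+// issued ahead of the vector work; since the AVX arithmetic does not modify EFLAGS, the
+// later jl still tests that comparison.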
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
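+//
+// note: backward column sweep for D <- D * E^{-T} with E 4x4 upper triangular: starting
+// from column 3, each column is scaled by inv_diag_E[j] and then e_ij times column j
+// (i < j, read from the packed panel of E) is subtracted from the earlier columns.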
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
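+//
+// note: kn (r12d) selects where the backward sweep starts: the updates for columns 3, 2
+// and 1 are skipped unless kn exceeds 3, 2 and 1 respectively, while column 0 is always
+// scaled.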
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
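+//
+// note: forward column sweep for D <- D * E^{-1} with E 4x4 upper triangular: column j
+// first subtracts e_ij times column i for every earlier column i < j, then is scaled by
+// the precomputed reciprocal inv_diag_E[j].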
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
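+//
+// note: E is an 8x8 unit lower triangular matrix stored as two 4-row panels (the second
+// panel at r10 + 4*sde*sizeof(double)). The first stage solves with the top-left 4x4
+// block and at the same time applies the bottom-left 4x4 panel to the lower half of D
+// (ymm4..ymm7); r10 is then advanced by 128 bytes to the next 4-column block and the
+// bottom-right 4x4 block is solved against ymm4..ymm7 alone.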
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+ // solve top-left
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ addq $128, %r10
+
+
+	// solve bottom-right
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10, %r11, 1), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10, %r11, 1), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10, %r11, 1), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
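+//
+// note: E is an 8x8 upper triangular matrix stored as two 4-row panels (the second panel
+// at r10 + 4*sde*sizeof(double)). Rows are solved backwards from row 7 to row 0: each row
+// of D is scaled by the matching entry of inv_diag_E and blended into place, then its
+// multiple of the corresponding column of E is subtracted from the rows above it (from
+// both panels while solving rows 7..4, and from the upper panel only for rows 3..0).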
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+
+ // bottom-right
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (note: the following 128-bit vmovapd already zeroes the upper lanes of ymm13, so this xor appears redundant)
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ // bottom-right
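+	// km (r13) is the number of valid rows in this panel: row 7 is solved
+	// only when km > 7, row 6 only when km > 6, row 5 only when km > 5;
+	// rows outside the panel are skipped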
+
+ cmpl $7, %r13d
+ jle 0f
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $6, %r13d
+ jle 1f
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $5, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
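+	// unblocked LU factorization of the 8x4 panel (no pivoting), one column
+	// at a time: for column j the contributions of columns 0..j-1 are
+	// subtracted, the reciprocal 1.0/d_jj is computed, stored into
+	// inv_diag_E (r10) and used to scale the sub-diagonal part of the column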
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+// vpermpd $0x00, %ymm1, %ymm13
+ vmovddup %xmm1, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+// vpermpd $0x00, %ymm2, %ymm13
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm2, %ymm13
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm2, %ymm13
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+// vpermpd $0x00, %ymm3, %ymm13
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm3, %ymm13
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm3, %ymm13
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xff, %ymm3, %ymm13
+// vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
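+	// build a per-row store mask for the lower 4x4 panel: subtracting the
+	// broadcast km from the constants in .LC03 leaves the sign bit set
+	// exactly for the in-range rows, and vmaskmovpd writes only those lanes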
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+
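+	// store only the lower triangle: for columns 1..3 the entries above the
+	// diagonal are read back from D and blended in, so they are left untouched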
+	vmovapd	%ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
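+	// ymm15 masks out the rows of the lower 4x4 panel at or beyond km; the
+	// entries above the diagonal of columns 1..3 are read back from D and
+	// blended in, so only the lower triangle is overwritten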
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
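+	// row masks: ymm14 enables the rows of the upper panel from m0 on,
+	// ymm15 the rows of the lower panel up to (but excluding) m1;
+	// vmaskmovpd uses the sign bit of each lane as its write mask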
+
+ // shift D and sol for cols
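+	// (columns before n0 are dropped by rotating the column registers left
+	// and advancing the store pointer D by one column, 32 bytes, per
+	// skipped column)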
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
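+	// non-zero offset: the 8 output rows are rotated by offset lanes across
+	// the two register panels and the row masks are rotated and split to
+	// match, so the block is written starting offset rows down, possibly
+	// spilling into a third panel (the r12,2 addressing below)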
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm13
+#endif
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x4, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x2, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
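+	// A and B are deliberately swapped: the 8x4 NT inner kernel computes the
+	// 8x4 product B * A^T, and the tran-scale routine below transposes it
+	// back into the 4x8 result alpha*A*B^T + beta*C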
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
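+	// km and kn are the actual numbers of rows and columns to store; the vs
+	// variant handles blocks smaller than 8x4 at the matrix edges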
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
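+	// the gen store writes an arbitrary sub-block: offsetD gives the row
+	// offset of the block inside its destination panel, while [m0,m1) and
+	// [n0,n1) select the rows and columns that are actually written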
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
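+	// the edge routine below handles the first k iterations for which B is
+	// not aligned to a 4-row panel boundary (as given by offsetB); the main
+	// nn kernel then runs on panel-aligned data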
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
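+//
+// Hedged reading of the name and of the INNER_STORE_L_* call below: this is a
+// symmetric rank-k update, D = alpha*A*B^T + beta*C, with only the part of the
+// 8x4 block on or below the diagonal stored.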
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
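+//
+// Hedged reading, matching the code below: with B upper triangular on the
+// right, the kernel first runs the plain nt inner kernel on the tail
+// (k-4 iterations, A and B advanced by 4*bs = 128 bytes), then accumulates the
+// leading 4x4 triangle of B through a dedicated edge routine, and finally
+// scales by alpha/beta and stores.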
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
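+//
+// Hedged reading: the sub inner kernel and the scale_11 blend form C - A*B^T,
+// the edge_dpotrf routine then Cholesky-factorizes the 8x4 block, and
+// inv_diag_D presumably receives the reciprocals of the factor's diagonal for
+// use by the dtrsm kernels further down.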
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
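+//
+// Hedged reading of the fused kernel: the accumulators first add Ap*Bp^T over
+// kp iterations, then subtract Am*Bm^T over km iterations, and the resulting
+// update of C is factorized exactly as in kernel_dpotrf_nt_l_8x4_lib4 above.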
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
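+//
+// Hedged note on the dtrsm kernels that follow (the naming convention is an
+// assumption, consistent with the edge routines they call): "rl"/"ru"/"ll"
+// select the side and triangle of E (right-lower, right-upper, left-lower),
+// "inv" applies the diagonal through the precomputed reciprocals in
+// inv_diag_E, and "one" assumes a unit diagonal. Here the sub inner kernel and
+// scale_11 form C - A*B^T, which the rlt edge routine then solves against the
+// transposed lower triangle of E, roughly
+//
+//     D = (C - A*B^T) * inv(E^T)   /* hedged sketch, E lower triangular */
+//
+// and the _vs_ store then writes only km rows and kn columns (hedged reading
+// of the km/kn arguments).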
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
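+//
+// Unlike the nt variants above, the nn kernels also take sdb: with B not
+// transposed its k rows span several 4-row panels, so the inner kernel needs
+// B's panel stride as well (hedged reading of the extra
+// "sall $5, %r14d // 4*sdb*sizeof(double)" below).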
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
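+// (the kernel computes the 8x4 block D solving E * D = C - A * B, with E an upper
+// triangular factor stored with panel stride sde and with its reciprocal diagonal
+// entries passed in inv_diag_E)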
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ // epilogue
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
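+// (applies a block of 4 Householder reflectors, defined by the 4-column factor pV
+// and the 4x4 triangular factor pT, to an 8-row panel of pD from the right,
+// updating pD in place)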
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_8_lib4
+ .type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_8_lib4
+ .def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm0
+ vaddpd %ymm14, %ymm4, %ymm4
+ //
+ vmovapd 32(%r11), %ymm12
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm1, %ymm1
+ vaddpd %ymm14, %ymm5, %ymm5
+ vbroadcastsd 32(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ //
+ vmovapd 64(%r11), %ymm12
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm2, %ymm2
+ vaddpd %ymm14, %ymm6, %ymm6
+ vbroadcastsd 64(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 72(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ //
+ vmovapd 96(%r11), %ymm12
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm3, %ymm3
+ vaddpd %ymm14, %ymm7, %ymm7
+ vbroadcastsd 96(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 104(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 112(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
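+	// (each double constant above and below is written as two .long words, low
+	// word first: e.g. high word 1071644672 = 0x3fe00000 with a zero low word is
+	// the IEEE-754 bit pattern of 0.5)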
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_diag_lib4.c b/kernel/avx/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..d64f977
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,866 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
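+	// tail of 1-3 leftover rows: subtracting the remaining row count m_f from the
+	// lane indices {0.5, 1.5, 2.5, 3.5} sets the sign bit exactly in the lanes to
+	// keep, and _mm256_maskstore_pd stores only those lanes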
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
+
+
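+// Illustrative scalar reference for the kernel above (a documentation-only sketch,
+// guarded out with #if 0; the name ref_dgemm_diag_right_4 is hypothetical and not
+// part of the BLASFEO API). It spells out the panel-major lib4 indexing assumed by
+// the vectorized code: element (r,j) of a matrix with panel stride s lives at
+// (r/4)*4*s + r%4 + 4*j, and column j is scaled by the j-th diagonal entry B[j].
+#if 0
+static void ref_dgemm_diag_right_4(int kmax, double alpha, double *A, int sda, double *B, double beta, double *C, int sdc, double *D, int sdd)
+	{
+	int r, j;
+	for(r=0; r<kmax; r++) // row index in the kmax x 4 panel
+		{
+		for(j=0; j<4; j++) // column index, B[j] is the j-th diagonal entry
+			{
+			D[(r/4)*4*sdd + r%4 + 4*j] = beta * C[(r/4)*4*sdc + r%4 + 4*j]
+				+ alpha * A[(r/4)*4*sda + r%4 + 4*j] * B[j];
+			}
+		}
+	}
+#endif
+
+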
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix, beta=0.0 case
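+// (in the left case the 4 diagonal entries of A, pre-scaled by alpha, multiply
+// every column of B elementwise, so a single vector a_00 is loaded once and
+// reused across the whole loop)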
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ sign,
+ a_00,
+ b_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256i
+ mask;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ mask = _mm256_set_epi64x( 1, -1, -1, -1 );
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+ _mm256_maskstore_pd( &D[4], mask, d_01 );
+ _mm256_maskstore_pd( &D[8], mask, d_02 );
+ _mm256_maskstore_pd( &D[12], mask, d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m128d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm_loaddup_pd( alpha );
+ beta0 = _mm_loaddup_pd( beta );
+
+ a_00 = _mm_load_pd( &A[0] );
+ a_00 = _mm_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[4] );
+ d_01 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[8] );
+ d_02 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[12] );
+ d_03 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+ c_00 = _mm_load_pd( &C[4] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_01 = _mm_add_pd( c_00, d_01 );
+ c_00 = _mm_load_pd( &C[8] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_02 = _mm_add_pd( c_00, d_02 );
+ c_00 = _mm_load_pd( &C[12] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_03 = _mm_add_pd( c_00, d_03 );
+
+ _mm_store_pd( &D[0], d_00 );
+ _mm_store_pd( &D[4], d_01 );
+ _mm_store_pd( &D[8], d_02 );
+ _mm_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+
+ _mm_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = A[0] * alpha0;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
diff --git a/kernel/avx/kernel_dgemv_12_lib4.S b/kernel/avx/kernel_dgemv_12_lib4.S
new file mode 100644
index 0000000..c51ad9a
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_12_lib4.S
@@ -0,0 +1,1322 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
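+// the PROLOGUE/EPILOGUE macros above save and restore the callee-saved registers
+// (rbx, rbp, r12-r15, plus rdi, rsi and xmm6-xmm15 on Windows) and issue
+// vzeroupper at the kernel boundaries to avoid AVX-SSE transition penalties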
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
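+// note: even columns accumulate into the _a set (ymm0-ymm2) and odd columns into
+// the _b set (ymm3-ymm5), so back-to-back vaddpd instructions are independent;
+// the two sets are summed afterwards by the blend/scale routine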
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_12_lib4, @function
+inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r14 // A1 <- A0
+ addq %r12, %r14 // A1 <- A0 + 4*sda*sizeof(double)
+ movq %r14, %r15 // A2 <- A1
+ addq %r12, %r15 // A2 <- A1 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 32(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 96(%r14), %ymm8
+ addq $128, %r14 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A2+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $32, %r11
+ addq $32, %r14
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_12_lib4, .-inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_12_lib4, @function
+inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 256(%r11) // software prefetch
+ prefetcht0 320(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ prefetcht0 256(%r14) // software prefetch
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ prefetcht0 320(%r14) // software prefetch
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
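+	// build a load mask: LC02 = {0.5, 1.5, 2.5, 3.5} minus the remaining count in
+	// r10d leaves the sign bit set exactly in the first r10d lanes, so vmaskmovpd
+	// below reads only the leftover entries of x (masked-off lanes read as zero)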
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_12_lib4, .-inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
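+//
+// reference sketch (not part of the build): with z_a = ymm0..2 and z_b = ymm3..5
+// viewed as two 12-element partial accumulators and y the caller's vector, the
+// routine computes, in C-like form (illustrative names only):
+//
+//     for(ii=0; ii<12; ii++)
+//         z[ii] = alpha[0]*(z_a[ii]+z_b[ii]) + beta[0]*y[ii];
+//
+// the result z is returned in ymm0..2 rather than written to memory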
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_12_lib4, @function
+inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_12_lib4, .-inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
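+//
+// reference sketch (not part of the build): here ymm0..11 hold, for each of the
+// 12 outputs, 4 partial sums that still need a horizontal reduction (done below
+// with vhaddpd/vperm2f128); in C-like form (illustrative names only):
+//
+//     for(ii=0; ii<12; ii++)
+//         z[ii] = alpha[0]*(z_part[ii][0]+z_part[ii][1]+z_part[ii][2]+z_part[ii][3])
+//               + beta[0]*y[ii];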
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_12_lib4, @function
+inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_12_lib4, .-inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
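+//
+// reference sketch (not part of the build): alg selects how the reduced result
+// is combined with y; in C-like form (illustrative names only):
+//
+//     for(ii=0; ii<12; ii++)
+//         {
+//         tmp = z_a[ii] + z_b[ii];
+//         if(alg==0)      z[ii] = tmp;
+//         else if(alg==1) z[ii] = y[ii] + tmp;
+//         else            z[ii] = y[ii] - tmp;   // alg==-1
+//         }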
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_12_lib4, @function
+inner_blender_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_12_lib4; .scl 2; .type 32; .endef
+inner_blender_n_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_12_lib4, .-inner_blender_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_12_lib4, @function
+inner_blender_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_12_lib4; .scl 2; .type 32; .endef
+inner_blender_t_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_12_lib4, .-inner_blender_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12_lib4, @function
+inner_store_12_lib4:
+#elif defined(OS_MAC)
+_inner_store_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12_lib4; .scl 2; .type 32; .endef
+inner_store_12_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+ vmovupd %ymm2, 64(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12_lib4, .-inner_store_12_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
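+// reference semantics (a sketch, not part of the build), assuming the usual
+// 'lib4' panel-major layout with bs=4, i.e. element (i,j) of the 12 x k block
+// at A[(i/4)*4*sda + 4*j + i%4]; the helper name is illustrative only:
+//
+//     void ref_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda,
+//                              double *x, double *beta, double *y, double *z)
+//         {
+//         int ii, jj;
+//         double tmp;
+//         for(ii=0; ii<12; ii++)
+//             {
+//             tmp = 0.0;
+//             for(jj=0; jj<k; jj++)
+//                 tmp += A[(ii/4)*4*sda + 4*jj + ii%4] * x[jj];
+//             z[ii] = alpha[0]*tmp + beta[0]*y[ii];
+//             }
+//         }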
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_12_lib4
+ .type kernel_dgemv_n_12_lib4, @function
+kernel_dgemv_n_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_12_lib4
+_kernel_dgemv_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_12_lib4
+ .def kernel_dgemv_n_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_12_lib4, .-kernel_dgemv_n_12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
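+// reference semantics (a sketch, not part of the build): same storage layout as
+// above, but the 12 outputs now come from the columns of the k x 12 block, i.e.
+//
+//     z[jj] = alpha[0] * sum_{ii=0..k-1} A[(ii/4)*4*sda + 4*jj + ii%4] * x[ii]
+//           + beta[0] * y[jj],   jj = 0, ..., 11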
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_12_lib4
+ .type kernel_dgemv_t_12_lib4, @function
+kernel_dgemv_t_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_12_lib4
+_kernel_dgemv_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_12_lib4
+ .def kernel_dgemv_t_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_12_lib4, .-kernel_dgemv_t_12_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_4_lib4.S b/kernel/avx/kernel_dgemv_4_lib4.S
new file mode 100644
index 0000000..656e220
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_4_lib4.S
@@ -0,0 +1,4503 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
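+//
+// loop-structure sketch (not part of the build): each iteration of the main
+// loop below consumes one 4x4 block of the panel, broadcasting one x entry per
+// column and keeping 4 independent accumulators to hide the vaddpd latency:
+//
+//     for(jj=0; jj<k-3; jj+=4)
+//         for(ll=0; ll<4; ll++)          // ll selects ymm0/ymm1/ymm2/ymm3
+//             for(ii=0; ii<4; ii++)
+//                 z_acc[ll][ii] += A[ii + 4*(jj+ll)] * x[jj+ll];
+//
+// the k%4 leftover columns are handled one at a time in the clean-up loop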
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_4_lib4, @function
+inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $8, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_4_lib4, .-inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
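+//
+// note on the clean-up below (a sketch, not upstream documentation): the mask
+// for vmaskmovpd is built from .LC02 = { 0.5 1.5 2.5 3.5 }; after the vsubpd,
+// lane ii carries the sign of (0.5+ii) - k_left, so lane ii is active exactly
+// when ii < k_left (vmaskmovpd only inspects the sign bit of each lane):
+//
+//     mask[ii] = ( (0.5+ii) - (double)k_left < 0.0 );   // ii = 0, ..., 3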
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_4_lib4, @function
+inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmaskmovpd 0(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmaskmovpd 32(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmaskmovpd 64(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmaskmovpd 96(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_4_lib4, .-inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
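+//
+// reference sketch (not part of the build): for each 4x4 block of the panel the
+// main loop fuses the transposed and the non-transposed products, with x_n
+// already broadcast in ymm6..9 and z_n streamed through memory:
+//
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<4; ii++)
+//             {
+//             z_t[jj] += A[ii+4*jj] * x_t[ii];   // A^T * x_t (reduced later)
+//             z_n[ii] += A[ii+4*jj] * x_n[jj];   // A   * x_n
+//             }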
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_4_lib4, @function
+inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+// vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+// vmovupd -32(%rsp), %ymm14 // load mask form stack
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+// vmovupd -32(%rsp), %ymm14 // load mask form stack
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+// vmovupd -32(%rsp), %ymm14 // load mask form stack
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+// vmovupd -32(%rsp), %ymm14 // load mask form stack
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+// vmovupd -32(%rsp), %ymm14 // load mask form stack
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ sall $3, %r10d // *sizeof(double)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_4_lib4, .-inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
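+//
+// note (a sketch, not upstream documentation): when the k x 4 block starts offA
+// rows inside a panel, the edge routine below rewinds A and x by offA and masks
+// the first panel so that only lanes with
+//
+//     offA <= ii  &&  ii < offA + kmax
+//
+// contribute, then adjusts k for the following panel-aligned main loop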
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemv_add_t_4_lib4, @function
+inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $3, %r15d // offA*sizeof(double)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2sd %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2sd %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm13, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+ vandpd %ymm15, %ymm14, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $4, %r10d // kmax - (4-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemv_add_t_4_lib4, .-inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
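+//
+// reference sketch (not part of the build): this edge routine processes the
+// diagonal 4x4 block of a lower-stored symmetric matrix; the diagonal and the
+// upper part are masked out of the z_n update so that each off-diagonal entry
+// is applied exactly once in each direction:
+//
+//     for(jj=0; jj<4; jj++)
+//         {
+//         for(ii=jj; ii<4; ii++)
+//             z_t[jj] += A[ii+4*jj] * x_t[ii];   // lower triangle incl. diagonal
+//         for(ii=jj+1; ii<4; ii++)
+//             z_n[ii] += A[ii+4*jj] * x_n[jj];   // strictly lower part only
+//         }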
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovupd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_gen_lib4, @function
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2sd %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $3, %rax // *sizeof(double)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_gen_lib4, .-inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_lib4, @function
+inner_blend_n_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_lib4; .scl 2; .type 32; .endef
+inner_blend_n_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_lib4, .-inner_blend_n_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_lib4, @function
+inner_blend_t_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_lib4; .scl 2; .type 32; .endef
+inner_blend_t_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_lib4, .-inner_blend_t_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_4_lib4, @function
+inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_4_lib4, .-inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_4_lib4, @function
+inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_4_lib4, .-inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib4, @function
+inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib4, .-inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib4, @function
+inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib4, .-inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
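+//
+// reference sketch (not part of the build): forward substitution with the 4x4
+// lower-triangular block E, the reciprocal of the diagonal being precomputed in
+// inv_diag_E so that divisions become multiplications:
+//
+//     for(jj=0; jj<4; jj++)
+//         {
+//         z[jj] *= inv_diag_E[jj];
+//         for(ii=jj+1; ii<4; ii++)
+//             z[ii] -= E[ii+4*jj] * z[jj];
+//         }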
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_lib4, @function
+inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_lib4, .-inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS, variable size version
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_vs_lib4, @function
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ // return
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_vs_lib4, .-inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
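+//
+// reference sketch (not part of the build): backward substitution with the
+// transpose of the 4x4 lower-triangular block E (solve E^T z = rhs), again
+// using the precomputed reciprocals in inv_diag_E:
+//
+//     for(jj=3; jj>=0; jj--)
+//         {
+//         z[jj] *= inv_diag_E[jj];
+//         for(ii=0; ii<jj; ii++)
+//             z[ii] -= E[jj+4*ii] * z[jj];
+//         }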
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_4_lib4, @function
+inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_4_lib4, .-inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_3_lib4, @function
+inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_3_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0x8, %ymm12, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+0:
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_3_lib4, .-inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_2_lib4, @function
+inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_2_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ cmpl $3, %r12d
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+// vmovsd 88(%r10), %xmm11
+// vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+// vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vblendpd $0xc, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xc, %ymm12, %ymm0, %ymm0
+
+ je 0f
+ jl 1f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+// vbroadcastsd 16(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_2_lib4, .-inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_1_lib4, @function
+inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_1_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xe, %ymm12, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ je 0f
+
+ cmpl $2, %r12d
+ je 1f
+ jl 2f
+
+ vmovsd 24(%r10), %xmm10
+ vblendpd $0xe, %ymm14, %ymm10, %ymm10
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+ vmovsd 16(%r10), %xmm9
+ vblendpd $0xe, %ymm14, %ymm9, %ymm9
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+2:
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_1_lib4, .-inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- x+4*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_4_lib4, @function
+inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_4_lib4, .-inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dtrmv_ut_4_lib4, @function
+inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dtrmv_ut_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jle 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $4, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+// vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+// vmovupd LC02(%rip), %ymm13
+#endif
+// vmovddup %xmm14, %xmm14
+// vinsertf128 $1, %xmm14, %ymm14, %ymm14
+// vsubpd %ymm14, %ymm13, %ymm14
+//
+// vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovupd 0(%r13), %ymm12
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dtrmv_ut_4_lib4, .-inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib4, @function
+inner_store_4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib4; .scl 2; .type 32; .endef
+inner_store_4_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib4, .-inner_store_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
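+// Mask construction below, sketched in C (not part of the original source):
+// .LC02 holds {0.5, 1.5, 2.5, 3.5} from low lane to high lane, so after
+// broadcasting km the store mask is
+//   mask[i] = (LC02[i] - (double)km < 0)   // sign bit set  <=>  i < km
+// and vmaskmovpd writes only z[0] .. z[km-1].
+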
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib4, @function
+inner_store_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib4, .-inner_store_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
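+// Mask construction below, sketched in C (not part of the original source):
+// with .LC02 = {0.5, 1.5, 2.5, 3.5},
+//   mask[i] = ((double)k0 - LC02[i] < 0) && (LC02[i] - (double)k1 < 0)
+// i.e. the sign bit is set exactly for k0 <= i < k1, so vmaskmovpd writes only
+// that range of z.
+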
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib4, @function
+inner_store_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r11d, %xmm14, %xmm14
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib4, .-inner_store_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+
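+// Reference semantics, as a hedged C sketch (not part of the original source);
+// A is assumed stored as a single 4-row lib4 panel, so element (i,j) sits at
+// A[i+4*j]:
+//   for(i=0; i<4; i++) { t[i] = 0.0;
+//     for(j=0; j<k; j++) t[i] += A[i+4*j]*x[j];
+//     z[i] = alpha[0]*t[i] + beta[0]*y[i]; }
+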
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_lib4
+ .type kernel_dgemv_n_4_lib4, @function
+kernel_dgemv_n_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_lib4
+_kernel_dgemv_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_lib4
+ .def kernel_dgemv_n_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_lib4, .-kernel_dgemv_n_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+
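+// Same computation as kernel_dgemv_n_4_lib4 above, but the store is masked so
+// that only the first k1 entries of z are written back (sketch, assuming
+// 0 < k1 <= 4):
+//   for(i=0; i<k1; i++) z[i] = alpha[0]*t[i] + beta[0]*y[i];
+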
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .type kernel_dgemv_n_4_vs_lib4, @function
+kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_vs_lib4
+_kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .def kernel_dgemv_n_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_vs_lib4, .-kernel_dgemv_n_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_n_4_gen_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .type kernel_dgemv_n_4_gen_lib4, @function
+kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_gen_lib4
+_kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .def kernel_dgemv_n_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k0
+ movq ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_gen_lib4, .-kernel_dgemv_n_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
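+// Reference semantics, as a hedged C sketch; A(i,j) abbreviates the lib4
+// panel-major access A[(i/4)*4*sda + i%4 + 4*j] (illustrative notation only,
+// not a BLASFEO API):
+//   for(j=0; j<4; j++) { t = 0.0;
+//     for(i=0; i<k; i++) t += A(i,j)*x[i];
+//     z[j] = alpha[0]*t + beta[0]*y[j]; }
+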
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_lib4
+ .type kernel_dgemv_t_4_lib4, @function
+kernel_dgemv_t_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_lib4
+_kernel_dgemv_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_lib4
+ .def kernel_dgemv_t_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_lib4, .-kernel_dgemv_t_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .type kernel_dgemv_t_4_vs_lib4, @function
+kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_vs_lib4
+_kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .def kernel_dgemv_t_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_vs_lib4, .-kernel_dgemv_t_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .type kernel_dgemv_t_4_gen_lib4, @function
+kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_gen_lib4
+_kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .def kernel_dgemv_t_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_gen_lib4, .-kernel_dgemv_t_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+
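+// Reference semantics, as a hedged C sketch (A(i,j) and L(i,j) are illustrative
+// accessors; L is the 4x4 lower-triangular block at A + 4*k doubles and
+// inv_diag_A[i] is assumed to hold 1.0/L(i,i)):
+//   for(i=0; i<4; i++) t[i] = y[i] - sum of A(i,j)*x[j] over j = 0 .. k-1;
+//   for(i=0; i<4; i++) { for(j=0; j<i; j++) t[i] -= L(i,j)*z[j];
+//                        z[i] = t[i]*inv_diag_A[i]; }   // forward substitution
+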
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .type kernel_dtrsv_ln_inv_4_lib4, @function
+kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_lib4
+_kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .def kernel_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+ movq %r11, %r13 // A+k*sizeof(double)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq %r13, %r10 // A+k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_lib4, .-kernel_dtrsv_ln_inv_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .type kernel_dtrsv_ln_inv_4_vs_lib4, @function
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_vs_lib4
+_kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .def kernel_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+ movq %r11, %r13
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq %r13, %r10 // A+k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+ // store vs
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_vs_lib4, .-kernel_dtrsv_ln_inv_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
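+// Reference semantics, as a hedged C sketch (A(i,j) and L(i,j) are illustrative
+// accessors; L is the 4x4 lower-triangular block at the top of A):
+//   for(j=0; j<4; j++) t[j] = y[j] - sum of A(i,j)*x[i] over i = 4 .. k-1;
+//   for(j=3; j>=0; j--) { for(i=j+1; i<4; i++) t[j] -= L(i,j)*z[i];
+//                         z[j] = t[j]*inv_diag_A[j]; }  // backward substitution
+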
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .type kernel_dtrsv_lt_inv_4_lib4, @function
+kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_4_lib4
+_kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .def kernel_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_4_lib4, .-kernel_dtrsv_lt_inv_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .type kernel_dtrsv_lt_inv_3_lib4, @function
+kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_3_lib4
+_kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .def kernel_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_3_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_3_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $3, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_3_lib4, .-kernel_dtrsv_lt_inv_3_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .type kernel_dtrsv_lt_inv_2_lib4, @function
+kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_2_lib4
+_kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .def kernel_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_2_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_2_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $2, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_2_lib4, .-kernel_dtrsv_lt_inv_2_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .type kernel_dtrsv_lt_inv_1_lib4, @function
+kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_1_lib4
+_kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .def kernel_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_1_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_1_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $1, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_1_lib4, .-kernel_dtrsv_lt_inv_1_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+
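+// Reference semantics, as a hedged C sketch (A(i,j) is an illustrative accessor
+// for the lib4 layout); the first 4 columns are treated as upper triangular by
+// the edge routine, the remaining k-4 columns as dense:
+//   for(i=0; i<4; i++) z[i] = sum of A(i,j)*x[j] over j = i .. k-1;
+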
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_4_lib4
+ .type kernel_dtrmv_un_4_lib4, @function
+kernel_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_4_lib4
+_kernel_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_4_lib4
+ .def kernel_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG4, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_4_lib4, .-kernel_dtrmv_un_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_lib4
+ .type kernel_dtrmv_ut_4_lib4, @function
+kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_lib4
+_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_lib4
+ .def kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_lib4, .-kernel_dtrmv_ut_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .type kernel_dtrmv_ut_4_vs_lib4, @function
+kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_vs_lib4
+_kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .def kernel_dtrmv_ut_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+ movq ARG6, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_vs_lib4, .-kernel_dtrmv_ut_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
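+// Reference semantics, as a hedged C sketch (A(i,j) is an illustrative
+// accessor): a single pass over A updates both products,
+//   z_n[i] += alpha_n[0] * sum of A(i,j)*x_n[j] over j = 0..3,    i = 0 .. k-1
+//   z_t[j]  = alpha_t[0] * sum of A(i,j)*x_t[i] over i = 0..k-1  + beta_t[0]*y_t[j]
+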
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_lib4
+ .type kernel_dgemv_nt_4_lib4, @function
+kernel_dgemv_nt_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_lib4
+_kernel_dgemv_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_lib4
+ .def kernel_dgemv_nt_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // inner blend n scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_lib4, .-kernel_dgemv_nt_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .type kernel_dgemv_nt_4_vs_lib4, @function
+kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_vs_lib4
+_kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .def kernel_dgemv_nt_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+0:
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // inner blend n scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_vs_lib4, .-kernel_dgemv_nt_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
+
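+// Reference semantics, as a hedged C sketch: with S the symmetric matrix whose
+// lower triangle is stored in A (lib4 layout, S(i,j) illustrative notation),
+//   for(i=0; i<k; i++) z[i] += alpha[0] * sum of S(i,j)*x[j] over j = 0..3;
+// the edge routine below keeps the 4x4 diagonal block from being counted twice.
+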
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+
+
+
+
+// 1      2              3          4          5        6          7          8
+// void kernel_dsymv_l_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .type kernel_dsymv_l_4_gen_lib4, @function
+kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_gen_lib4
+_kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .def kernel_dsymv_l_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_gen_lib4, .-kernel_dsymv_l_4_gen_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_8_lib4.S b/kernel/avx/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..53d371e
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
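+// Register-use sketch: r11 tracks the first 4-row panel A0 and r15 is set to
+// A0 + 4*sda*sizeof(double), the panel holding rows 4..7; ymm0/ymm2 accumulate
+// partial sums for rows 0-3 and ymm1/ymm3 for rows 4-7.
+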
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ addq $32, %r11
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
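+//
+// note: accumulates the partial dot products z_j += A(:,j)'*x for 8 columns,
+// consuming 4 entries of x per main-loop iteration; the clean-up builds a
+// mask from .LC02 = { 0.5 1.5 2.5 3.5 } and the remaining count so that
+// vmaskmovpd only loads the in-range entries of x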
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-8
+// r11 <- A+4*8*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+8*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
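+//
+// note: handles the leading 8x8 upper-triangular block: vblendpd against the
+// zeroed ymm14 masks out the strictly-lower entries of each column, so only
+// the upper-triangular part of A contributes; k, A and x are advanced past
+// these 8 columns before the general kernel takes over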
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ // first 4 columns
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+
+ // last 4 columns
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
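+//
+// note: vhaddpd pairs the partial sums of two accumulators within each
+// 128-bit lane and vperm2f128 regroups the low/high lanes, so the final
+// adds leave [z0 z1 z2 z3] in ymm0 and [z4 z5 z6 z7] in ymm1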
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
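+//
+// note: sums the _a and _b accumulator sets, scales by alpha (broadcast from
+// r10) and adds beta*y, with beta at r11 and y loaded unaligned from r12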
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
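+//
+// computes z[0:8] = alpha*A*x + beta*y, where A is an 8 x k block stored as
+// two 4-row panels (panel stride sda) in panel-major (lib4) format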
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_8_lib4
+ .type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_8_lib4
+ .def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
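+//
+// computes z[0:8] = alpha*A'*x + beta*y, where A is a k x 8 block stored in
+// panel-major (lib4) format with panel stride sda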
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_8_lib4
+ .type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_8_lib4
+ .def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
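+//
+// computes z[0:8] = A*x for the upper, not-transposed triangular case: the
+// leading 8x8 block of A is read as upper triangular by the edge routine,
+// the remaining k-8 columns go through the general dgemv_n inner kernel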
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_8_lib4
+ .type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_8_lib4
+ .def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
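+// Householder QR factorization of the first 4 columns of an m x 4 block
+// stored in panel-major (lib4) format with panel stride sdd;
+// for each column j, with alpha = D[j][j] and x = D[j+1:m][j]:
+//   beta  = -/+ sqrt(alpha^2 + x'*x)   (sign opposite to alpha)
+//   tau_j = (beta - alpha) / beta
+//   v     = x / (alpha - beta)
+// beta overwrites the diagonal entry, v the column below it, tau_j goes in
+// dD[j], and the reflector is applied to the trailing columns of the block
+// (the gemv_t & ger steps below)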
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ ii = 1;
+ if(m>1)
+ {
+ tmp = pD[1+ps*0];
+ beta += tmp*tmp;
+ if(m>2)
+ {
+ tmp = pD[2+ps*0];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*0];
+ beta += tmp*tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ ii = 1;
+ if(m>1)
+ {
+ pD[1+ps*0] *= tmp;
+ if(m>2)
+ {
+ pD[2+ps*0] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*0] *= tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*0] *= tmp;
+ pD[1+ii*sdd+ps*0] *= tmp;
+ pD[2+ii*sdd+ps*0] *= tmp;
+ pD[3+ii*sdd+ps*0] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[0+ps*1];
+ w2 = pD[0+ps*2];
+ w3 = pD[0+ps*3];
+ if(m>1)
+ {
+ w1 += pD[1+ps*1] * pD[1+ps*0];
+ w2 += pD[1+ps*2] * pD[1+ps*0];
+ w3 += pD[1+ps*3] * pD[1+ps*0];
+ if(m>2)
+ {
+ w1 += pD[2+ps*1] * pD[2+ps*0];
+ w2 += pD[2+ps*2] * pD[2+ps*0];
+ w3 += pD[2+ps*3] * pD[2+ps*0];
+ if(m>3)
+ {
+ w1 += pD[3+ps*1] * pD[3+ps*0];
+ w2 += pD[3+ps*2] * pD[3+ps*0];
+ w3 += pD[3+ps*3] * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[0+ps*1] += w1;
+ pD[0+ps*2] += w2;
+ pD[0+ps*3] += w3;
+ if(m>1)
+ {
+ pD[1+ps*1] += w1 * pD[1+ps*0];
+ pD[1+ps*2] += w2 * pD[1+ps*0];
+ pD[1+ps*3] += w3 * pD[1+ps*0];
+ if(m>2)
+ {
+ pD[2+ps*1] += w1 * pD[2+ps*0];
+ pD[2+ps*2] += w2 * pD[2+ps*0];
+ pD[2+ps*3] += w3 * pD[2+ps*0];
+ if(m>3)
+ {
+ pD[3+ps*1] += w1 * pD[3+ps*0];
+ pD[3+ps*2] += w2 * pD[3+ps*0];
+ pD[3+ps*3] += w3 * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+ }
+ if(m==1)
+ return;
+ // second column
+ beta = 0.0;
+ if(m>2)
+ {
+ tmp = pD[2+ps*1];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*1];
+ beta += tmp*tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v1
+ pD[1+ps*1] = beta;
+ if(m>2)
+ {
+ pD[2+ps*1] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*1] *= tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] *= tmp;
+ pD[1+ii*sdd+ps*1] *= tmp;
+ pD[2+ii*sdd+ps*1] *= tmp;
+ pD[3+ii*sdd+ps*1] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[1+ps*2];
+ w3 = pD[1+ps*3];
+ if(m>2)
+ {
+ w2 += pD[2+ps*2] * pD[2+ps*1];
+ w3 += pD[2+ps*3] * pD[2+ps*1];
+ if(m>3)
+ {
+ w2 += pD[3+ps*2] * pD[3+ps*1];
+ w3 += pD[3+ps*3] * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[1+ps*2] += w2;
+ pD[1+ps*3] += w3;
+ if(m>2)
+ {
+ pD[2+ps*2] += w2 * pD[2+ps*1];
+ pD[2+ps*3] += w3 * pD[2+ps*1];
+ if(m>3)
+ {
+ pD[3+ps*2] += w2 * pD[3+ps*1];
+ pD[3+ps*3] += w3 * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+ }
+ if(m==2)
+ return;
+ // third column
+ beta = 0.0;
+ if(m>3)
+ {
+ tmp = pD[3+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v2
+ pD[2+ps*2] = beta;
+ if(m>3)
+ {
+ pD[3+ps*2] *= tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] *= tmp;
+ pD[1+ii*sdd+ps*2] *= tmp;
+ pD[2+ii*sdd+ps*2] *= tmp;
+ pD[3+ii*sdd+ps*2] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[2+ps*3];
+ if(m>3)
+ {
+ w3 += pD[3+ps*3] * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ w3 = - dD[2] * w3;
+ pD[2+ps*3] += w3;
+ if(m>3)
+ {
+ pD[3+ps*3] += w3 * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+ }
+ if(m==3)
+ return;
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau3
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v3
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] *= tmp;
+ pD[1+ii*sdd+ps*3] *= tmp;
+ pD[2+ii*sdd+ps*3] *= tmp;
+ pD[3+ii*sdd+ps*3] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+// unblocked algorithm
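+// variable-size variant: computes k Householder reflectors of an m x n block
+// whose first row sits at offset offD inside its panel (panel stride sdd);
+// jmax0/kmax0 count the rows up to the next panel boundary, so the first
+// partial panel is handled separately before the unrolled 4-row loops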
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp, w0;
+ double *pC00, *pC10, *pC01, *pC11;
+ int offset;
+ double *pD0 = pD-offD;
+ for(ii=0; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ tmp = pC10[1+offset];
+ beta += tmp*tmp;
+ tmp = pC10[2+offset];
+ beta += tmp*tmp;
+ tmp = pC10[3+offset];
+ beta += tmp*tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ pC10[0+offset] *= tmp;
+ pC10[1+offset] *= tmp;
+ pC10[2+offset] *= tmp;
+ pC10[3+offset] *= tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ pC00[0] = beta;
+ }
+ if(ii<n)
+ {
+ pC01 = pC00 + ps;
+ pC11 = pC10 + ps;
+ kmax = jmax;
+ kmax0 = jmax0;
+ jmax = n-ii-1;
+ jj = 0;
+ for( ; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ps*jj] * 1.0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+ w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+ w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ps*jj] += w0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ offset = offset-ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+ pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+ pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
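+// applies the 4-reflector block update to the m x n matrix C: builds the 4x4
+// triangular factor T of the compact-WY representation from dD and the dot
+// products of the reflector vectors stored in pD, then computes
+// C = (I - V*T*V') * C, processing two columns of C at a time (one at a time
+// in the tail loop)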
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 2;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[1+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pD[1+ps*0];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] = tmp;
+ tmp = pC[1+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pD[2+ps*0];
+ d1 = pD[2+ps*1];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] = tmp;
+ tmp = pC[2+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] = tmp;
+ if(m>3)
+ {
+ d0 = pD[3+ps*0];
+ d1 = pD[3+ps*1];
+ d2 = pD[3+ps*2];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] = tmp;
+ tmp = pC[3+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ pC[0+ps*1] -= pW[1+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+ps*0];
+ pW[0+ldw*1] = tmp;
+ if(m>2)
+ {
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+ps*1];
+ pW[0+ldw*2] = tmp;
+ if(m>3)
+ {
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+ps*2];
+ pW[0+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ }
+
+ return;
+ }
+
+
+
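+// same compact-WY update as kernel_dlarf_4_lib4, but the reflectors are also
+// passed as an explicitly transposed copy pVt and a workspace pW0 is used, so
+// that W = V'*C can be formed with the dgemm_nn kernels (or with the AVX
+// intrinsics in the disabled branch) before T is applied and C is updated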
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC, *pW;
+ double pT[16];// = {};
+ int ldt = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ __m256d
+ _w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
+ ii = 0;
+#if 1
+ double alpha = 1.0;
+ double beta = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<n-11; ii+=12)
+ {
+ kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+#endif
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ if(ii<n)
+ {
+// kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+ kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+ }
+#else
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ _w0 = _mm256_setzero_pd();
+ _w1 = _mm256_setzero_pd();
+ _w2 = _mm256_setzero_pd();
+ _w3 = _mm256_setzero_pd();
+ for(jj=0; jj<m-3; jj+=4)
+ {
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ _d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ // TODO mask store
+ _mm256_storeu_pd( &pW[0+ps*0], _w0 );
+ _mm256_storeu_pd( &pW[0+ps*1], _w1 );
+ _mm256_storeu_pd( &pW[0+ps*2], _w2 );
+ _mm256_storeu_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ps*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ }
+#endif
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _w1 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _w2 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _w3 = _mm256_mul_pd( _t0, _tp );
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ _mm256_store_pd( &pW[0+ps*1], _w1 );
+ _mm256_store_pd( &pW[0+ps*2], _w2 );
+ _mm256_store_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ }
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ b1 = pW[3+ps*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ // load
+ c00 = pC[0+jj*sdc+ps*2];
+ c10 = pC[1+jj*sdc+ps*2];
+ c20 = pC[2+jj*sdc+ps*2];
+ c30 = pC[3+jj*sdc+ps*2];
+ c01 = pC[0+jj*sdc+ps*3];
+ c11 = pC[1+jj*sdc+ps*3];
+ c21 = pC[2+jj*sdc+ps*3];
+ c31 = pC[3+jj*sdc+ps*3];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*3];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*3];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*3];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c30 -= b0;
+ b1 = pW[3+ps*3];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*2] = c00;
+ pC[0+jj*sdc+ps*3] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*2] = c10;
+ pC[1+jj*sdc+ps*3] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*2] = c20;
+ pC[2+jj*sdc+ps*3] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*2] = c30;
+ pC[3+jj*sdc+ps*3] = c31;
+ }
+ }
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ }
+
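+	// Rows below the first 4x4 block (jj>=4) are downdated with the rank-4 update
+	// kernels kernel_dger4_sub_*_lib4, peeling 12/8/4 rows per call depending on the
+	// target; the #else branch keeps what appears to be an equivalent intrinsics and
+	// scalar version of the same C -= D * W update.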
+#if 1
+ jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+#endif
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+ }
+ if(jj<m)
+ {
+ kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+ }
+#else
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ _c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+ _c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+ _c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+ _c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ // store
+ _mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ // load
+ c00 = pC[ll+jj*sdc+ps*2];
+ c01 = pC[ll+jj*sdc+ps*3];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*3];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*2] = c00;
+ pC[ll+jj*sdc+ps*3] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+#endif
+
+ return;
+ }
+
+
+
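+// Note on the column sweeps below (a sketch inferred from the code, not upstream
+// documentation): each step follows the LAPACK dlarfg convention. For a row
+// x = [alpha, x2] it computes beta = -sign(alpha)*||x||_2, stores tau = (beta-alpha)/beta
+// in dD[i], scales the trailing entries by 1/(alpha-beta) so they form the reflector v
+// (unit first entry implicit), and applies H = I - tau*v*v^T from the right to the rows
+// below it, accumulating the squared norm needed by the next column as it goes.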
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ }
+ return;
+ }
+
+
+
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+	if(m<=0 || n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp;
+ double w00, w01,
+ w10, w11,
+ w20, w21,
+ w30, w31;
+ __m256d
+ _a0, _b0, _t0, _w0, _w1;
+ double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+ double pT[4];
+ int ldt = 2;
+ double *pD0 = pD-offD;
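+	// pD0 rewinds the pointer by the row offset offD so that a logical row r = offD+ii
+	// can be addressed as (r & (ps-1)) within its panel plus (r - (r & (ps-1)))*sdd for
+	// the panel base, plus ii*ps for the column offset; this is what the pC00/pC10/pC20
+	// expressions below spell out.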
+ ii = 0;
+#if 1 // rank 2
+ for(; ii<imax-1; ii+=2)
+ {
+ // first row
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ kmax = n-ii;
+ w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ // second row
+ pC11 = pC10+ps*1;
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC11[0+ps*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ps*jj] *= tmp;
+ }
+ // compute T
+ kmax = n-ii;
+ tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+ pT[0+ldt*0] = - dD[ii+0];
+ pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldt*1] = - dD[ii+1];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-2;
+ jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+ pC20 = pC20a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ pC20 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _w1 = _mm256_load_pd( &pC20[0+ps*1] );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
+ _w1 = _mm256_mul_pd( _w1, _b0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC20[0+ps*0], _a0 );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _a0 = _mm256_add_pd( _a0, _w1 );
+ _mm256_store_pd( &pC20[0+ps*1], _a0 );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w1, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC20[0+ps*kk], _a0 );
+ }
+ pC20 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ if(ii<n)
+ {
+ // compute T
+ pT[0+ldt*0] = - dD[ii+0];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ pC10 = pC10a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ pC10 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC10[0+ps*0] );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC10[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC10[0+ps*0], _a0 );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC10[0+ps*kk], _a0 );
+ }
+ pC10 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ }
+ }
+ return;
+ }
+
+
+
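+// Sketch of the kernel below (inferred from the code): given the 4 reflectors of the
+// panel stored row-wise in pD (unit entries implicit) and their tau values in dD, it
+// accumulates the pairwise inner products v_i^T v_j and fills the upper triangular
+// factor T of the compact WY representation, with the diagonal stored as -dD[i] so
+// that the sign of the blocked update is folded into T.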
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+ {
+ const int ps = 4;
+ int kk;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ // 0
+ // 1
+ v10 = pD[0+ps*1];
+ // 2
+ v10 += pD[1+ps*2]*pD[0+ps*2];
+ v20 = pD[0+ps*2];
+ v21 = pD[1+ps*2];
+ // 3
+ v10 += pD[1+ps*3]*pD[0+ps*3];
+ v20 += pD[2+ps*3]*pD[0+ps*3];
+ v21 += pD[2+ps*3]*pD[1+ps*3];
+ v30 = pD[0+ps*3];
+ v31 = pD[1+ps*3];
+ v32 = pD[2+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ v10 += pD[1+ps*kk]*pD[0+ps*kk];
+ v20 += pD[2+ps*kk]*pD[0+ps*kk];
+ v30 += pD[3+ps*kk]*pD[0+ps*kk];
+ v21 += pD[2+ps*kk]*pD[1+ps*kk];
+ v31 += pD[3+ps*kk]*pD[1+ps*kk];
+ v32 += pD[3+ps*kk]*pD[2+ps*kk];
+ }
+ pT[0+ps*0] = - dD[0];
+ pT[1+ps*1] = - dD[1];
+ pT[2+ps*2] = - dD[2];
+ pT[3+ps*3] = - dD[3];
+ pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+ pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+ pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+ pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+ pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+ return;
+ }
+
+
+
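+// The fused kernel below performs the same 4-row LQ step as kernel_dgelqf_4_lib4
+// while building the 4x4 T factor on the fly (the w0..w3 accumulators double as the
+// inner products needed for T). It is only compiled when the target is not Haswell,
+// where a separate variant is presumably provided elsewhere.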
+// assume n>=4
+#if ! defined(TARGET_X64_INTEL_HASWELL)
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ return;
+ }
+#endif
+
+
+
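+// Sketch of the single-row update below: with the 4 reflectors stored row-wise in pV
+// and the triangular factor in pT, the row d held in pD is transformed as
+//   w = d * V^T,   w = w * T,   d = d + w * V,
+// i.e. d := d * (I + V^T * T * V); since T carries the negated taus (see the dlarft
+// kernels above), this amounts to the usual application of the block reflector from
+// the right, just written with additions.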
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
+
+
+
+
diff --git a/kernel/avx/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..91d1cc0
--- /dev/null
+++ b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1434 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
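+// Sketch of the pivot search used below (inferred from the code): entries are made
+// non-negative with an andnot against the sign mask, a running vector maximum and a
+// matching vector of lane indices are kept via cmp/blendv, and the indices are carried
+// as doubles offset by 0.2 so that the final _mm_cvtsd_si32 rounds to the intended
+// integer row. Lanes past the end in the tail iteration are overwritten with -0.0 so
+// that they can never win the strict > comparison.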
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+
+
+ // third column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+
+
+ // fourth column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
+
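+// A hedged plain-C sketch (kept disabled) of what the vectorized pivot search
+// above computes: the index of the largest absolute value in a column. The
+// AVX code keeps a running per-lane maximum (max), the matching lane index
+// (imx, with idx advanced by vna per 4-row block) and reduces both at the
+// end; ref_idamax below is the scalar equivalent for a contiguous array.
+#if 0
+#include <math.h>
+static int ref_idamax(int n, const double *x)
+	{
+	int ii, imax = 0;
+	double amax = 0.0;
+	for(ii=0; ii<n; ii++)
+		{
+		double ai = fabs(x[ii]);
+		if(ai>amax) // strict '>' keeps the lowest index on ties, as in the kernel
+			{
+			amax = ai;
+			imax = ii;
+			}
+		}
+	return imax;
+	}
+#endif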
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>1)
+ {
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+ }
+
+ if(n==2)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // third column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>2)
+ {
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n==3)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // fourth column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>3)
+ {
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
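+
+// A hedged sketch (kept disabled) of the tail-masking idiom used throughout
+// the kernels above: a lane must be left untouched when its index is not
+// below the number of rows still to process, so comparing the constant
+// vector lft = { 0.2, 1.2, 2.2, 3.2 } against (double)(rows left) with
+// predicate 14 (greater-than) sets the mask exactly in the lanes to skip,
+// and _mm256_blendv_pd then restores their old contents.
+#if 0
+#include <immintrin.h>
+static void ref_masked_scale_tail(double *x, int m_left, double scal)
+	{
+	double dlft = (double) m_left;
+	__m256d lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	__m256d msk = _mm256_cmp_pd( lft, _mm256_broadcast_sd( &dlft ), 14 ); // lane index >= rows left
+	__m256d x0  = _mm256_loadu_pd( x );
+	__m256d tmp = _mm256_mul_pd( x0, _mm256_set1_pd( scal ) );
+	x0 = _mm256_blendv_pd( tmp, x0, msk ); // masked lanes keep their old value
+	_mm256_storeu_pd( x, x0 );
+	}
+#endif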
+
diff --git a/kernel/avx/kernel_dsymv_6_lib4.S b/kernel/avx/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..b55690a
--- /dev/null
+++ b/kernel/avx/kernel_dsymv_6_lib4.S
@@ -0,0 +1,1031 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 128(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 160(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+ vmaskmovpd 0(%r14), %ymm14, %ymm13
+
+ vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+// vmaskmovpd -32(%rsp), %ymm14
+ vmaskmovpd 0(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 32(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 64(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 96(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 128(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 160(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd %ymm13, %ymm14, 0(%r14)
+
+ sall $3, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
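+// A hedged plain-C sketch (kept disabled) of what one pass of the inner
+// kernel above accumulates, written for an ordinary column-major A with
+// leading dimension lda instead of the panel-major layout: z_t (the six
+// running dot products in ymm0..ymm5) and z_n (streamed back through r14)
+// are updated in the same sweep over the k rows.
+#if 0
+static void ref_dgemv_add_nt_6(int k, const double *A, int lda,
+	const double *x_t, const double *x_n, double *z_t, double *z_n)
+	{
+	int ii, jj;
+	for(ii=0; ii<k; ii++)
+		{
+		for(jj=0; jj<6; jj++)
+			{
+			z_t[jj] += A[ii+lda*jj] * x_t[ii]; // transposed product, kept in registers
+			z_n[ii] += A[ii+lda*jj] * x_n[jj]; // non-transposed product, written back
+			}
+		}
+	}
+#endif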
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
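+// A hedged plain-C sketch (kept disabled) of the diagonal-block handling in
+// the edge routine above (column-major A with leading dimension lda assumed):
+// column jj of the 4x4 block contributes rows ii>=jj to the transposed
+// product and rows ii>jj to the non-transposed one, which is what the
+// progressive vblendpd zeroing implements lane by lane.
+#if 0
+static void ref_dsymv_edge_nt_4(const double *A, int lda,
+	const double *x_t, const double *x_n, double *z_t, double *z_n)
+	{
+	int ii, jj;
+	for(jj=0; jj<4; jj++)
+		{
+		for(ii=jj; ii<4; ii++)
+			z_t[jj] += A[ii+lda*jj] * x_t[ii]; // diagonal included
+		for(ii=jj+1; ii<4; ii++)
+			z_n[ii] += A[ii+lda*jj] * x_n[jj]; // strictly below the diagonal
+		}
+	}
+#endif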
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm5, %ymm4, %ymm4
+// vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vextractf128 $0x1, %ymm4, %xmm5
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm4
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm4, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmovupd 32(%r12), %ymm13
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmulpd %ymm15, %ymm13, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
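+// A hedged plain-C sketch (kept disabled) of the reduction and scaling done
+// by the routine above: each of the six accumulators holds four partial sums
+// that are added horizontally, then z[jj] = alpha*sum + beta*y[jj].
+#if 0
+static void ref_blend_t_scale_ab_6(double alpha, double beta,
+	const double *y, const double partial[6][4], double *z)
+	{
+	int jj, ll;
+	for(jj=0; jj<6; jj++)
+		{
+		double s = 0.0;
+		for(ll=0; ll<4; ll++)
+			s += partial[jj][ll]; // horizontal add of one accumulator
+		z[jj] = alpha*s + beta*y[jj];
+		}
+	}
+#endif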
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %xmm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi    rsi           rdx           rcx        r8       r9           rsp+8        rsp+16         rsp+24       rsp+32       rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_6_lib4
+ .type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_6_lib4
+ .def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+ vbroadcastsd 32(%r10), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vbroadcastsd 40(%r10), %ymm11
+ vmulpd %ymm15, %ymm11, %ymm11
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_6_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
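+// A hedged plain-C sketch (kept disabled) of the overall operation of
+// kernel_dgemv_nt_6_lib4 on a panel-major A (panel height bs=4, panel
+// stride sda), assuming k active rows and 6 columns:
+//   z_n[0:k] += alpha_n * A[0:k,0:6]   * x_n[0:6]
+//   z_t[0:6]  = alpha_t * A[0:k,0:6]^T * x_t[0:k] + beta_t * y_t[0:6]
+#if 0
+static void ref_dgemv_nt_6(int k, double alpha_n, double alpha_t,
+	const double *A, int sda, const double *x_n, const double *x_t,
+	double beta_t, const double *y_t, double *z_n, double *z_t)
+	{
+	const int bs = 4;
+	int ii, jj;
+	double acc[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+	for(ii=0; ii<k; ii++)
+		{
+		for(jj=0; jj<6; jj++)
+			{
+			// element (ii,jj) of a panel-major matrix
+			double a = A[ii/bs*bs*sda + ii%bs + bs*jj];
+			acc[jj] += a * x_t[ii];
+			z_n[ii] += alpha_n * a * x_n[jj];
+			}
+		}
+	for(jj=0; jj<6; jj++)
+		z_t[jj] = alpha_t*acc[jj] + beta_t*y_t[jj];
+	}
+#endif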
+
+
+
+#if 0
+// TODO
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner edge dsyrk & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
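+// A hedged plain-C sketch (kept disabled) of the operation the disabled
+// kernel above targets, assuming x_n and x_t alias the same vector x and
+// only the lower triangle of the symmetric matrix is referenced
+// (column-major A with leading dimension lda):  z += alpha * A_sym * x.
+#if 0
+static void ref_dsymv_l(int n, double alpha, const double *A, int lda,
+	const double *x, double *z)
+	{
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=jj; ii<n; ii++)
+			{
+			z[jj] += alpha * A[ii+lda*jj] * x[ii];
+			if(ii>jj)
+				z[ii] += alpha * A[ii+lda*jj] * x[jj]; // mirrored upper part
+			}
+		}
+	}
+#endif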
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
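+// A hedged sketch (kept disabled) of how the clean-up code builds its lane
+// mask from .LC02 above (lane i holds i + 0.5): subtracting the element
+// count k leaves a negative value (sign bit set) exactly in the lanes with
+// index below k, and vmaskmovpd only touches lanes whose sign bit is set.
+#if 0
+#include <immintrin.h>
+static __m256d ref_tail_mask(int k)
+	{
+	__m256d lanes = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+	return _mm256_sub_pd( lanes, _mm256_set1_pd( (double) k ) ); // MSB=1 where lane index < k
+	}
+#endif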
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx/kernel_sgead_lib8.S b/kernel/avx/kernel_sgead_lib8.S
new file mode 100644
index 0000000..4cafa0a
--- /dev/null
+++ b/kernel/avx/kernel_sgead_lib8.S
@@ -0,0 +1,3096 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_lib8, @function
+inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r12
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps 64(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_0_lib8, .-inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+
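+// A hedged plain-C sketch (kept disabled) of the aligned case above: add
+// alpha times an 8-row float panel of A to the matching panel of B, four
+// columns per iteration of the unrolled loop.
+#if 0
+static void ref_sgead_8_0(int kmax, float alpha, const float *A, float *B)
+	{
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		for(ii=0; ii<8; ii++)
+			B[ii+8*jj] += alpha * A[ii+8*jj];
+	}
+#endif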
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_gen_lib8, @function
+inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 32(%r12), %ymm0
+ vmaskmovps 32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r12
+
+ vmovups -64(%r12), %ymm0
+ vmaskmovps 64(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovups -32(%r12), %ymm0
+ vmaskmovps -32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_gen_lib8, .-inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+
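+// A hedged plain-C sketch (kept disabled) of the generalized case above: the
+// row mask built from m1 restricts the update to the first m1 rows of the
+// 8-row panel; everything else matches the aligned case.
+#if 0
+static void ref_sgead_8_0_gen(int kmax, float alpha, const float *A, float *B, int m1)
+	{
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		for(ii=0; ii<8 && ii<m1; ii++)
+			B[ii+8*jj] += alpha * A[ii+8*jj];
+	}
+#endif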
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_lib8, @function
+inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#else
+ vmovups 4(%r12), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovups 36(%r12), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovups -60(%r12), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovups -28(%r12), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_lib8, .-inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_gen_lib8, @function
+inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
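+	// ymm15 = { 0.5, 1.5, ..., 7.5 } - m1: the sign bit is set only in the first m1
+	// elements, so the vmaskmovps below load and store only rows 0..m1-1 of B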
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_gen_lib8, .-inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_lib8, @function
+inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_lib8, .-inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_gen_lib8, @function
+inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_gen_lib8, .-inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_lib8, @function
+inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_lib8, .-inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_gen_lib8, @function
+inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_gen_lib8, .-inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_lib8, @function
+inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
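+	// ymm0 = rows 4..7 of the A0 panel column followed by rows 0..3 of the A1 panel
+	// column, i.e. the source column shifted down by four rows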
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 96(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_lib8, .-inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_gen_lib8, @function
+inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 96(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_gen_lib8, .-inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_lib8, @function
+inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_lib8, .-inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_gen_lib8, @function
+inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_gen_lib8, .-inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_lib8, @function
+inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_lib8, .-inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_gen_lib8, @function
+inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_gen_lib8, .-inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_lib8, @function
+inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_lib8, .-inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_gen_lib8, @function
+inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_gen_lib8, .-inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_lib8
+ .type kernel_sgead_8_0_lib8, @function
+kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_lib8
+_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_lib8
+ .def kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_lib8, .-kernel_sgead_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_gen_lib8
+ .type kernel_sgead_8_0_gen_lib8, @function
+kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_gen_lib8
+_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_gen_lib8
+ .def kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_gen_lib8, .-kernel_sgead_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_lib8
+ .type kernel_sgead_8_1_lib8, @function
+kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_lib8
+_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_lib8
+ .def kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_lib8, .-kernel_sgead_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_gen_lib8
+ .type kernel_sgead_8_1_gen_lib8, @function
+kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_gen_lib8
+_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_gen_lib8
+ .def kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_gen_lib8, .-kernel_sgead_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_lib8
+ .type kernel_sgead_8_2_lib8, @function
+kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_lib8
+_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_lib8
+ .def kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_lib8, .-kernel_sgead_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_gen_lib8
+ .type kernel_sgead_8_2_gen_lib8, @function
+kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_gen_lib8
+_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_gen_lib8
+ .def kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_gen_lib8, .-kernel_sgead_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_lib8
+ .type kernel_sgead_8_3_lib8, @function
+kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_lib8
+_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_lib8
+ .def kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_lib8, .-kernel_sgead_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_gen_lib8
+ .type kernel_sgead_8_3_gen_lib8, @function
+kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_gen_lib8
+_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_gen_lib8
+ .def kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_gen_lib8, .-kernel_sgead_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_lib8
+ .type kernel_sgead_8_4_lib8, @function
+kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_lib8
+_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_lib8
+ .def kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_lib8, .-kernel_sgead_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_gen_lib8
+ .type kernel_sgead_8_4_gen_lib8, @function
+kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_gen_lib8
+_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_gen_lib8
+ .def kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_gen_lib8, .-kernel_sgead_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_lib8
+ .type kernel_sgead_8_5_lib8, @function
+kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_lib8
+_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_lib8
+ .def kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_lib8, .-kernel_sgead_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_gen_lib8
+ .type kernel_sgead_8_5_gen_lib8, @function
+kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_gen_lib8
+_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_gen_lib8
+ .def kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_gen_lib8, .-kernel_sgead_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_lib8
+ .type kernel_sgead_8_6_lib8, @function
+kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_lib8
+_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_lib8
+ .def kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_lib8, .-kernel_sgead_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_gen_lib8
+ .type kernel_sgead_8_6_gen_lib8, @function
+kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_gen_lib8
+_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_gen_lib8
+ .def kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_gen_lib8, .-kernel_sgead_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgead_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_lib8
+ .type kernel_sgead_8_7_lib8, @function
+kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_lib8
+_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_lib8
+ .def kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_lib8, .-kernel_sgead_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_sgead_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_gen_lib8
+ .type kernel_sgead_8_7_gen_lib8, @function
+kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_gen_lib8
+_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_gen_lib8
+ .def kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_gen_lib8, .-kernel_sgead_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgecp_lib8.S b/kernel/avx/kernel_sgecp_lib8.S
new file mode 100644
index 0000000..5cd2c00
--- /dev/null
+++ b/kernel/avx/kernel_sgecp_lib8.S
@@ -0,0 +1,2796 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
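+
+
+// Note on the two blocks above: the System V AMD64 ABI (Linux, macOS) passes the first six
+// integer/pointer arguments in rdi, rsi, rdx, rcx, r8, r9 and treats all xmm registers as
+// caller-saved, so only rbx, rbp and r12-r15 are spilled. The Windows x64 ABI passes only
+// four arguments in registers (rcx, rdx, r8, r9), reserves a 32-byte shadow space (hence
+// stack arguments starting at STACKSIZE+40, i.e. past the shadow space and return address),
+// and additionally requires rdi, rsi and xmm6-xmm15 to be preserved, which is why the
+// Windows PROLOGUE/EPILOGUE pair is larger and STACKSIZE grows to 256.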
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_lib8, @function
+inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_lib8:
+#endif
+#endif
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps %ymm0, 32(%r12)
+ addq $128, %r11
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_gen_lib8, @function
+inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovups 32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+ addq $128, %r11
+
+ vmovups -64(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovups -32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
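+
+
+// The "compute mask for rows" sequence above builds the vmaskmovps store mask without any
+// integer AVX: m1 is converted to float, broadcast to all eight lanes, and subtracted from
+// the constants in .LC00 (lane i holds i + 0.5). Lane i then has its sign bit set exactly
+// when i < m1, so vmaskmovps writes only the first m1 rows of each column. A scalar C
+// sketch of the equivalent store (illustrative only, not a BLASFEO function):
+//
+// static void store_col_masked(const float *src, float *dst, int m1)
+//	{
+//	int ii;
+//	for(ii=0; ii<8; ii++)
+//		if((float)ii + 0.5f - (float)m1 < 0.0f) // sign bit set <=> row is written
+//			dst[ii] = src[ii];
+//	}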
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_lib8, @function
+inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
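+	// Preferred path: aligned 32-byte loads followed by an in-register rotation
+	// (blend / permilps / perm2f128 / blend). The disabled #else branch below builds the
+	// same result from unaligned vmovups loads and a single blend, and is kept for reference.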
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#else
+ vmovups 4(%r11), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 36(%r11), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovups -60(%r11), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovups -28(%r11), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
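+
+
+// The blend/permute sequence in the _8_1 kernel above realigns rows across panels: for a
+// source that starts one row into its 8-row panel, output row i of each column comes from
+// row i+1 of panel A0 for i = 0..6 and from row 0 of the next panel A1 for i = 7. The
+// _8_2 ... _8_7 kernels below do the same for offsets 2..7. A scalar sketch of the
+// per-column copy (illustrative only, not a declared BLASFEO function):
+//
+// static void copy_col_off(int off, const float *a0, const float *a1, float *b)
+//	{
+//	int ii;
+//	for(ii=0; ii<8; ii++)
+//		b[ii] = ii+off<8 ? a0[ii+off] : a1[ii+off-8];
+//	}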
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_gen_lib8, @function
+inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_lib8, @function
+inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_gen_lib8, @function
+inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_lib8, @function
+inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_gen_lib8, @function
+inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_lib8, @function
+inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_gen_lib8, @function
+inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_lib8, @function
+inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_gen_lib8, @function
+inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_lib8, @function
+inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_gen_lib8, @function
+inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_lib8, @function
+inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_gen_lib8, @function
+inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_lib8
+ .type kernel_sgecp_8_0_lib8, @function
+kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_lib8
+_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_lib8
+ .def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
+#endif
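+
+
+// A possible call site for the aligned-copy kernel above, assuming the lib8 panel-major
+// layout in which the 8 rows of a column are contiguous and consecutive columns are 8
+// floats apart (hypothetical driver code, not part of the library):
+//
+// void copy_panel_8xk(int k, float *A, float *B)
+//	{
+//	kernel_sgecp_8_0_lib8(k, A, B); // k = number of columns in the 8-row panel
+//	}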
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .type kernel_sgecp_8_0_gen_lib8, @function
+kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_gen_lib8
+_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_lib8
+ .type kernel_sgecp_8_1_lib8, @function
+kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_lib8
+_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_lib8
+ .def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .type kernel_sgecp_8_1_gen_lib8, @function
+kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_gen_lib8
+_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_lib8
+ .type kernel_sgecp_8_2_lib8, @function
+kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_lib8
+_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_lib8
+ .def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .type kernel_sgecp_8_2_gen_lib8, @function
+kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_gen_lib8
+_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_lib8
+ .type kernel_sgecp_8_3_lib8, @function
+kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_lib8
+_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_lib8
+ .def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .type kernel_sgecp_8_3_gen_lib8, @function
+kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_gen_lib8
+_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_lib8
+ .type kernel_sgecp_8_4_lib8, @function
+kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_lib8
+_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_lib8
+ .def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .type kernel_sgecp_8_4_gen_lib8, @function
+kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_gen_lib8
+_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_lib8
+ .type kernel_sgecp_8_5_lib8, @function
+kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_lib8
+_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_lib8
+ .def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .type kernel_sgecp_8_5_gen_lib8, @function
+kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_gen_lib8
+_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_lib8
+ .type kernel_sgecp_8_6_lib8, @function
+kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_lib8
+_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_lib8
+ .def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .type kernel_sgecp_8_6_gen_lib8, @function
+kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_gen_lib8
+_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_lib8
+ .type kernel_sgecp_8_7_lib8, @function
+kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_lib8
+_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_lib8
+ .def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .type kernel_sgecp_8_7_gen_lib8, @function
+kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_gen_lib8
+_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
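+
+// The .long values above are raw IEEE-754 single-precision bit patterns
+// (e.g. 1056964608 = 0x3f000000 = 0.5f, 1065353216 = 0x3f800000 = 1.0f,
+// 3212836864 = 0xbf800000 = -1.0f); each block spells out, from element 7 down to
+// element 0, the vector shown in its label comment. .LC00-.LC02 hold the lane
+// indices 0.5..23.5, presumably compared against m0/m1 in the *_gen kernels to
+// build row masks.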
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_16x4_lib8.S b/kernel/avx/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..5c2d6c4
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,7057 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
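+
+// Note on the two ABI variants above: on Linux/Mac (System V AMD64) the first six
+// integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and ARG7 onwards are read
+// from the caller's stack; on Windows (Win64) only the first four are in registers,
+// ARG5 onwards start at STACKSIZE+40(%rsp) (return address plus 32-byte shadow
+// space), and rdi, rsi and xmm6-xmm15 are callee-saved, hence the larger prologue.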
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
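+// Loop structure: A is read as two 8-row panels (A0 at r11, A1 at r11+r12);
+// ymm0-ymm3 accumulate the top 8x4 block and ymm4-ymm7 the bottom 8x4 block.
+// Each k-iteration loads the four B entries for that k (vbroadcastf128), splats
+// them one at a time with vshufps, and adds the rank-1 products; the main loop is
+// unrolled 4 times, followed by a 4-step tail and a scalar clean-up loop.
+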
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
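+// Same structure as inner_kernel_gemm_add_nt_16x4_lib8 above, except that the
+// rank-1 products are subtracted (vsubps), i.e. the accumulators are updated as
+// D -= A * B'.
+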
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14 <- 8*sdb*sizeof(float)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14 <- 8*sdb*sizeof(float)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
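+// Loop structure: in this NN variant each B entry is splatted straight from memory
+// with vbroadcastss (column j of the current B panel lives at byte offset 32*j),
+// the main loop is unrolled 8 times so that one full 8-deep B panel is consumed
+// per iteration, after which r13 jumps to the next panel by r14; the next B panel
+// is software-prefetched at the top of the loop.
+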
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $8, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ cmpl $8, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $7, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm14 // B[0]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm14 // B[1]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm14 // B[2]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm14 // B[3]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B-offB+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
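+// This edge routine brings B back to panel alignment before the main NN kernel
+// runs: it consumes min(k, 8-offB) k-iterations one at a time, starting at
+// B+offB*sizeof(float), and if any work remains it advances r13 to the start of
+// the next B panel. A and the accumulators are updated exactly as in the scalar
+// clean-up loop of the inner kernel above.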
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm0, %ymm0
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm1, %ymm1
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm2, %ymm2
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm3, %ymm3
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // end-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B-offB+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
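+// This edge routine handles the lower-triangular top of B: the first k-iteration
+// touches only accumulator column 0, the second columns 0-1, the third columns 0-2,
+// after which the regular gemm kernel takes over. The branches on offB (<=4, 5, 6,
+// 7) only decide at which step the walk over B must wrap from the current 8-row
+// panel to the next one (r13 += r14).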
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
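+// Forward substitution over the 4 columns of the 16x4 block: column j (ymm j and
+// ymm j+4) is scaled by inv_diag_D[j], then for each later column i the entry
+// D[i][j] is broadcast and D[i][j]*column_j is subtracted from column i; kn (r12d)
+// stops the elimination early when fewer than 4 columns are valid.
+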
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
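+// One Cholesky step per column: the pivot d_jj is checked against 0.0, the factor
+// 1.0/sqrt(d_jj) is stored to inv_diag_E[j] and used to scale column j (ymm j and
+// ymm j+4), and the trailing columns receive a rank-1 downdate with the scaled
+// column; the out-of-line branches 1:/3:/5:/7: substitute 0.0 as the scaling
+// factor when the pivot is not positive. r11d (kn) limits how many columns are
+// processed.
+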
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
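+
+
+// Reference model (comment only): the Cholesky edge above factorizes the leading 4x4 block of
+// the 16x4 accumulator and scales the rows below it, storing 1/sqrt(d_jj) into inv_diag_D
+// (zero is used whenever a pivot is not strictly positive).  The 12x4 variant that follows
+// does the same, with the diagonal block taken from rows 4-7 of the first panel.  The sketch
+// is illustrative only; ref_potrf_16x4_edge and the D[16][4] array are not BLASFEO names.
+//
+//     #include <math.h>
+//
+//     static void ref_potrf_16x4_edge(float D[16][4], float *inv_diag_D, int kn)
+//     {
+//         for (int j = 0; j < 4; j++) {
+//             float d = D[j][j];                          // pivot d_jj
+//             float inv = (d > 0.0f) ? 1.0f / sqrtf(d) : 0.0f;
+//             inv_diag_D[j] = inv;
+//             for (int i = 0; i < 16; i++)
+//                 D[i][j] *= inv;                         // scale column j
+//             if (j + 1 >= kn)
+//                 return;                                 // vs variant: remaining columns unused
+//             for (int jj = j + 1; jj < 4; jj++)
+//                 for (int i = 0; i < 16; i++)
+//                     D[i][jj] -= D[jj][j] * D[i][j];     // rank-1 update of column jj
+//         }
+//     }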
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
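+
+
+// Reference model (comment only): inner_scale_ab_16x4_lib8 rescales the accumulator by alpha
+// and, when beta is nonzero, adds beta*C read from two 8-row panels (rows 0-7 at C, rows 8-15
+// at C + 8*sdc floats).  The sketch assumes the lib8 panel-major layout, element (i,j) at
+// C[(i%8) + 8*j + (i/8)*8*sdc]; the helper name is illustrative only.
+//
+//     static void ref_scale_ab_16x4(float alpha, float beta,
+//                                   const float *C, int sdc, float acc[16][4])
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = 0; i < 16; i++) {
+//                 acc[i][j] *= alpha;
+//                 if (beta != 0.0f)
+//                     acc[i][j] += beta * C[(i % 8) + 8*j + (i / 8)*8*sdc];
+//             }
+//     }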
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+	je		7f // end
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // C1 <- C0
+ addq %r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
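+
+
+// Reference model (comment only): inner_store_16x4_lib8 writes the 16x4 accumulator into D in
+// the lib8 panel-major layout: rows 0-7 go into the panel starting at D, rows 8-15 into the
+// panel at D + 8*sdd floats, and within a panel column j occupies 8 consecutive floats at
+// offset 8*j.  The helper name below is illustrative only.
+//
+//     static void ref_store_16x4(const float acc[16][4], float *D, int sdd)
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = 0; i < 16; i++)
+//                 D[(i % 8) + 8*j + (i / 8)*8*sdd] = acc[i][j];
+//     }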
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+	jl		0f // end
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+	jl		0f // end
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+	je		0f // end
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+ jmp 0f
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
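+
+
+// Reference model (comment only): the vs store above always writes rows 0-7 and masks rows
+// 8-15 against km with vmaskmovps; the mask is built as LC01 - km, whose sign bit selects the
+// lanes to store (assuming the .LC01 constant defined elsewhere in this file holds
+// {8.5, 9.5, ..., 15.5}, so lane i is stored iff row 8+i < km).  Columns beyond kn are skipped.
+// The helper below is illustrative only and assumes 1 <= kn <= 4.
+//
+//     static void ref_store_16x4_vs(const float acc[16][4], float *D, int sdd,
+//                                   int km, int kn)
+//     {
+//         for (int j = 0; j < 4 && j < kn; j++)
+//             for (int i = 0; i < 16; i++)
+//                 if (i < 8 || i < km)                    // row mask applies to rows 8-15 only
+//                     D[(i % 8) + 8*j + (i / 8)*8*sdd] = acc[i][j];
+//     }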
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%rbx)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%rbx)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%rbx)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%rbx)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbp // D1
+	addq	%r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
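+
+
+// Reference model (comment only): the gen store above clips the stored block to the column
+// window [n0, min(n1,4)) and to a row window, masking rows 0-7 against m0 and rows 8-15
+// against m1 (the offset > 0 cases are left as TODO; only offset==0 is implemented).
+// The sketch assumes .LC00 = {0.5, ..., 7.5} and .LC01 = {8.5, ..., 15.5}; the helper name
+// is illustrative only.
+//
+//     static void ref_store_16x4_gen(const float acc[16][4], float *D, int sdd,
+//                                    int m0, int m1, int n0, int n1)
+//     {
+//         int jmax = n1 < 4 ? n1 : 4;
+//         for (int j = n0; j < jmax; j++)
+//             for (int i = 0; i < 16; i++) {
+//                 int keep = (i < 8) ? (i >= m0) : (i < m1);  // per-panel row masks
+//                 if (keep)
+//                     D[(i % 8) + 8*j + (i / 8)*8*sdd] = acc[i][j];
+//             }
+//     }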
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
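+
+
+// Reference model (comment only): the lower ("store_l") variant above blends the first three
+// columns with the existing contents of D so that the strictly upper triangle of the leading
+// 4x4 block is left untouched; everything on or below the diagonal is overwritten.  In the
+// 12x4 variants further below the preserved triangle starts four rows down.  The helper name
+// is illustrative only.
+//
+//     static void ref_store_l_16x4(const float acc[16][4], float *D, int sdd)
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = 0; i < 16; i++)
+//                 if (i >= j)                             // keep D above the diagonal
+//                     D[(i % 8) + 8*j + (i / 8)*8*sdd] = acc[i][j];
+//     }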
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
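+
+
+// Reference model (comment only) of the whole kernel above: D = alpha*A*B^T + beta*C on a
+// 16x4 tile, with A, C and D in lib8 panel-major storage (element (i,l) of A at
+// A[(i%8) + 8*l + (i/8)*8*sda]) and B read as the first 4 rows of an 8-row panel
+// (element (j,l) at B[j + 8*l]).  The function name and loop ordering are illustrative only;
+// as in the assembly, C is not read when beta is zero.
+//
+//     static void ref_kernel_sgemm_nt_16x4(int k, float alpha, const float *A, int sda,
+//                                          const float *B, float beta,
+//                                          const float *C, int sdc, float *D, int sdd)
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = 0; i < 16; i++) {
+//                 float acc = 0.0f;
+//                 for (int l = 0; l < k; l++)
+//                     acc += A[(i % 8) + 8*l + (i / 8)*8*sda] * B[j + 8*l];
+//                 float d = alpha * acc;
+//                 if (beta != 0.0f)
+//                     d += beta * C[(i % 8) + 8*j + (i / 8)*8*sdc];
+//                 D[(i % 8) + 8*j + (i / 8)*8*sdd] = d;
+//             }
+//     }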
+
+
+
+
+
+// 1      2              3          4        5            6             7          8        9          10       11      12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
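+
+
+// Reference model (comment only) for the nn kernel above: same as the nt case for A, C and D,
+// but B is a k x 4 block stored in 8-row panels of leading dimension sdb and shifted down by
+// offsetB rows inside its first panel (this is what the gemm_add_nn edge routine consumes).
+// The indexing below is an assumption about the lib8 layout; names are illustrative only.
+//
+//     static void ref_kernel_sgemm_nn_16x4(int k, float alpha, const float *A, int sda,
+//                                          int offsetB, const float *B, int sdb,
+//                                          float beta, const float *C, int sdc,
+//                                          float *D, int sdd)
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = 0; i < 16; i++) {
+//                 float acc = 0.0f;
+//                 for (int l = 0; l < k; l++) {
+//                     int r = l + offsetB;                // row of B, counting the offset
+//                     acc += A[(i % 8) + 8*l + (i / 8)*8*sda]
+//                          * B[(r % 8) + 8*j + (r / 8)*8*sdb];
+//                 }
+//                 float d = alpha * acc;
+//                 if (beta != 0.0f)
+//                     d += beta * C[(i % 8) + 8*j + (i / 8)*8*sdc];
+//                 D[(i % 8) + 8*j + (i / 8)*8*sdd] = d;
+//             }
+//     }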
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
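+
+
+// Reference model (comment only) for the syrk kernel above: the full 16x4 product is
+// accumulated and scaled exactly as in the gemm nt kernel, but the lower store keeps the
+// strictly upper entries of the leading 4x4 block of D unchanged, so only i >= j is written
+// (for the 12x4 kernels below, only i >= j+4).  The helper name is illustrative only.
+//
+//     static void ref_kernel_ssyrk_nt_l_16x4(int k, float alpha, const float *A, int sda,
+//                                            const float *B, float beta,
+//                                            const float *C, int sdc, float *D, int sdd)
+//     {
+//         for (int j = 0; j < 4; j++)
+//             for (int i = j; i < 16; i++) {              // lower triangle only
+//                 float acc = 0.0f;
+//                 for (int l = 0; l < k; l++)
+//                     acc += A[(i % 8) + 8*l + (i / 8)*8*sda] * B[j + 8*l];
+//                 float d = alpha * acc;
+//                 if (beta != 0.0f)
+//                     d += beta * C[(i % 8) + 8*j + (i / 8)*8*sdc];
+//                 D[(i % 8) + 8*j + (i / 8)*8*sdd] = d;
+//             }
+//     }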
+
+
+
+
+
+// 1      2              3          4        5            6             7          8        9          10       11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1      2              3          4        5            6             7          8        9          10       11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
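+// Triangular solve kernel ("rl_inv": right-hand side factor, lower, with
+// inverted diagonal): the accumulator is loaded with C - A*B^T (gemm_sub
+// followed by scale_11), then the 16x4 result is solved on the right against
+// E^T (E lower triangular), multiplying by the precomputed reciprocals of E's
+// diagonal in inv_diag_E instead of dividing.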
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
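+// Fused gemm + trsm kernel: the accumulator first gains + Ap*Bp^T (add inner
+// kernel) and - Am*Bm^T (sub inner kernel), C is added on top (scale_11), and
+// the same right-lower transposed solve as in the strsm kernels above is then
+// applied using inv_diag_E, before the plain 16x4 store.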
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
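+// Cholesky factorization kernel: the accumulator is loaded with C - A*B^T
+// (gemm_sub plus scale_11), the 4x4 block on the diagonal is factorized and
+// the remaining rows are solved against it (inner_edge_potrf_12x4_vs_lib8),
+// and the reciprocals of the computed diagonal entries are written to
+// inv_diag_D for later reuse.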
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
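+// Triangular matrix-matrix multiply ("nn_rl": no transposes, triangular factor
+// on the right, lower): D = alpha * A * B with B lower triangular. offsetB is
+// the row offset of B inside its panel; the two edge routines called below
+// handle the initial triangle and the unaligned start of B before the plain
+// "nn" gemm loop and the alpha-only scaling (scale_a0) take over.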
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
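+
+// The .long values below are IEEE-754 single-precision bit patterns: LC00,
+// LC01 and LC02 hold { 0.5, 1.5, ..., 23.5 } (for instance 1056964608 =
+// 0x3f000000 = 0.5f), presumably compared against row indices to build the
+// lane masks used by the variable-size and generalized store routines, while
+// LC03 holds 1.0f in its six low lanes and -1.0f (3212836864 = 0xbf800000) in
+// the two high lanes.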
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x4_lib8.S b/kernel/avx/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..d319a83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,6673 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
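+
+// The ARGn macros above hide the two supported calling conventions: on
+// Linux/Mac (System V AMD64) the first six integer/pointer arguments arrive in
+// rdi, rsi, rdx, rcx, r8, r9 and the remaining ones are read from the caller's
+// stack at STACKSIZE+8(%rsp) and up (the +8 skips the return address, since
+// the prologue has already moved rsp down by STACKSIZE); on Windows x64 the
+// first four arrive in rcx, rdx, r8, r9, and the prologue additionally saves
+// rdi, rsi and xmm6-xmm15, which are callee-saved in that ABI.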
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+ addq %r13, %r14 // B_next <- B + 4*sdb*sizeof(double)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
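+ // B is stored in 8-row panels: within the current 8x4 block the element on
+ // row l, column j sits at byte offset 4*l+32*j, so each unroll broadcasts
+ // the 4 entries of one B row (stride 32) against one A column; after 8
+ // unrolls r12 jumps to the next panel (r14), which is software-prefetched
+ // at the top of the loop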
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+
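+ // same scheme as inner_kernel_gemm_add_nn_8x4_lib8, but the rank-1 updates
+ // are subtracted (vsubps) from the accumulators instead of added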
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+ addq %r13, %r14 // B_next <- B + 4*sdb*sizeof(double)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
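+ // peel off kend = min(k, 8-offsetB) iterations one at a time, so that B
+ // reaches the boundary of its current 8-row panel; if any k remains
+ // afterwards, B is advanced to the start of the next panel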
+ movl $8, %r15d
+ subl %r14d, %r15d // 8-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r12, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
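+ // handle the first 3 rows of the lower-triangular B block (those with fewer
+ // than 4 nonzeros); for offsetB>4 these rows reach or cross the 8-row panel
+ // boundary, so B is advanced to the next panel along the way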
+ cmpl $4, %r14d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r14d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movl $0, %r14d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r14d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r14d
+// jg 0f
+
+ // offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
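+ // forward substitution on the 4 columns: column j is scaled by the
+ // precomputed reciprocal inv_diag_E[j] (r11), then e_ij*column_j is
+ // subtracted from every later column i>j, with e_ij read from E (r10)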
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
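+ // factor the 4 pivot columns in sequence: each diagonal entry d_jj is
+ // compared against 0.0; if positive, 1.0/sqrt(d_jj) is computed and stored
+ // into inv_diag_E, otherwise 0.0 is stored (branches 1/3/5/7); the column is
+ // then scaled by it and its outer product removed from the later columns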
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+
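+ // scale the accumulators by alpha and add beta*C column by column; when
+ // beta==0.0 the C update is skipped entirely, so C is never read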
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+
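+ // scale by alpha, then transpose the 8x4 block (held column-interleaved in
+ // ymm4-ymm7 by the nt kernel) into the eight 4-entry columns xmm0-xmm7 of
+ // the 4x8 result, and finally add beta*C unless beta==0.0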
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+ vblendps $0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm0, %xmm0
+ vmovaps 32(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm1, %xmm1
+ vmovaps 64(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm2, %xmm2
+ vmovaps 96(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm3, %xmm3
+ vmovaps 128(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm4, %xmm4
+ vmovaps 160(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm5, %xmm5
+ vmovaps 192(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm6, %xmm6
+ vmovaps 224(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm7, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+ vblendps $0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ // C is addressed through r13; beta is held in ymm15 (ymm14 is 0.0)
+ vmovaps 0(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm0, %xmm0
+ vmovaps 32(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm1, %xmm1
+ vmovaps 64(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm2, %xmm2
+ vmovaps 96(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm3, %xmm3
+ vmovaps 128(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm4, %xmm4
+ vmovaps 160(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm5, %xmm5
+ vmovaps 192(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm6, %xmm6
+ vmovaps 224(%r13), %xmm12
+ vmulps %xmm12, %xmm15, %xmm12
+ vaddps %xmm12, %xmm7, %xmm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+
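+ // the nt kernel leaves the four result columns interleaved across
+ // ymm0-ymm3 (see the _gen variant header below); the two rounds of vblendps
+ // undo that permutation so ymm0-ymm3 hold the true columns before scaling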
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %r15 // C0
+ addq %r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+
+ vmovaps %xmm0, 0(%r10)
+ vmovaps %xmm1, 32(%r10)
+ vmovaps %xmm2, 64(%r10)
+ vmovaps %xmm3, 96(%r10)
+ vmovaps %xmm4, 128(%r10)
+ vmovaps %xmm5, 160(%r10)
+ vmovaps %xmm6, 192(%r10)
+ vmovaps %xmm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
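+// note on the row mask: km (r11) is converted to float and compared against
+// the per-lane constants in .LC00 (defined with the other data of this file;
+// assumed here to hold ascending values such as 0.5, 1.5, ..., 7.5), so the
+// resulting mask is negative exactly in the first km lanes and vmaskmovps
+// writes only those rows; kn (r12) selects how many columns are stored.
+// In scalar form (illustrative only, d[i][j] standing for the accumulators):
+//
+//	for(j=0; j<kn; j++)
+//		for(i=0; i<km; i++)
+//			D[i+j*8] = d[i][j];
+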
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm12, %ymm14
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %ymm1, %ymm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r10)
+ je 0f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm12, %xmm14
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %xmm1, %xmm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %xmm2, %xmm14, 64(%r10)
+ cmpl $4, %r12d
+ jl 0f // end
+ vmaskmovps %xmm3, %xmm14, 96(%r10)
+ cmpl $5, %r12d
+ jl 0f // end
+ vmaskmovps %xmm4, %xmm14, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %xmm5, %xmm14, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %xmm6, %xmm14, 192(%r10)
+ je 0f // end
+ vmaskmovps %xmm7, %xmm14, 224(%r10)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ vmovaps %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ addq $32, %r11
+
+ cmpl $3, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ addq $32, %r11
+
+ cmpl $4, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ addq $32, %r11
+
+ cmpl $5, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ addq $32, %r11
+
+ cmpl $6, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %xmm1, %xmm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %xmm2, %xmm15, 64(%r11)
+ cmpl $4, %r15d
+ jl 7f // end
+ vmaskmovps %xmm3, %xmm15, 96(%r11)
+ cmpl $5, %r15d
+ jl 7f // end
+ vmaskmovps %xmm4, %xmm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %xmm5, %xmm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %xmm6, %xmm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %xmm7, %xmm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vblendps $0x3, %ymm13, %ymm2, %ymm2
+ vblendps $0x7, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ //
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
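+//
+// Reference (scalar) version of this kernel, for documentation only; it is
+// not compiled and the ref_ name is illustrative. It assumes the lib8
+// panel-major layout used throughout this file: element (i,j) of an 8-row
+// panel is stored at offset i+j*8, so A and B are 8 x k panels and C, D are
+// 8x4 blocks inside a panel.
+//
+// static void ref_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+//	{
+//	int i, j, l;
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			float acc = 0.0f;
+//			for(l=0; l<k; l++)
+//				acc += A[i+l*8] * B[j+l*8]; // D = alpha*A*B^T + beta*C
+//			D[i+j*8] = alpha[0]*acc + beta[0]*C[i+j*8];
+//			}
+//	}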
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
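+//
+// Same computation as kernel_sgemm_nt_8x4_lib8 above; the extra km/kn
+// arguments only clamp the store, so just rows i<km and columns j<kn of the
+// 8x4 result block are written to D (see INNER_STORE_8X4_VS_LIB8). In scalar
+// form (illustrative only, acc(i,j) standing for the accumulated dot product
+// of row i of A with row j of B):
+//
+//	for(j=0; j<kn; j++)
+//		for(i=0; i<km; i++)
+//			D[i+j*8] = alpha[0]*acc(i,j) + beta[0]*C[i+j*8];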
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
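+//
+// Generalized variant: C and D may start at a row offset inside their first
+// panel (offsetC/offsetD, with sdc/sdd the panel strides; only offset 0 is
+// implemented below, the other offsets are TODO stubs), and m0/m1, n0/n1
+// give the row/column window of the 8x4 block that is actually read and
+// written. Scalar sketch of the offsetC==offsetD==0 case (illustrative only,
+// acc(i,j) as in the vs sketch above):
+//
+//	for(j=n0; j<n1 && j<4; j++)
+//		for(i=m0; i<m1 && i<8; i++)
+//			D[i+j*8] = alpha[0]*acc(i,j) + beta[0]*C[i+j*8];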
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
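+//
+// Reference (scalar) version of this kernel, for documentation only; the
+// ref_ name is illustrative. A, C and D are addressed as in the nt kernels
+// above; for B (the non-transposed operand) it assumes the usual lib8
+// panel-major addressing, where offsetB is the row offset of B inside its
+// first panel and consecutive panels are 8*sdb floats apart.
+//
+// static void ref_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D)
+//	{
+//	int i, j, l, p, r;
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			float acc = 0.0f;
+//			for(l=0; l<k; l++)
+//				{
+//				p = (offsetB+l)/8; // panel containing row offsetB+l of B
+//				r = (offsetB+l)%8; // row inside that panel
+//				acc += A[i+l*8] * B[p*8*sdb + r + j*8];
+//				}
+//			D[i+j*8] = alpha[0]*acc + beta[0]*C[i+j*8]; // D = alpha*A*B + beta*C
+//			}
+//	}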
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
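+//
+// Same update as the nt gemm kernel above, but only the lower triangle of
+// the 8x4 block is written back: entries with i<j keep whatever D already
+// contains (see INNER_STORE_L_8X4_LIB8). Scalar sketch (illustrative only):
+//
+//	for(j=0; j<4; j++)
+//		for(i=j; i<8; i++)
+//			{
+//			float acc = 0.0f;
+//			for(l=0; l<k; l++)
+//				acc += A[i+l*8] * B[j+l*8];
+//			D[i+j*8] = alpha[0]*acc + beta[0]*C[i+j*8];
+//			}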
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
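+//
+// Reference (scalar) version, for documentation only; the ref_ name is
+// illustrative. It computes M = C - A*B^T and then solves D * E^T = M, with
+// E a 4x4 lower triangular factor stored in lib8 format (E(i,j) at E[i+j*8])
+// and inv_diag_E holding the reciprocals of its diagonal.
+//
+// static void ref_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+//	{
+//	int i, j, l;
+//	float M[8*4];
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			M[i+j*8] = C[i+j*8];
+//			for(l=0; l<k; l++)
+//				M[i+j*8] -= A[i+l*8] * B[j+l*8];
+//			}
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			float tmp = M[i+j*8];
+//			for(l=0; l<j; l++)
+//				tmp -= D[i+l*8] * E[j+l*8]; // forward substitution over previous columns
+//			D[i+j*8] = tmp * inv_diag_E[j];
+//			}
+//	}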
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
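+//
+// Reference (scalar) version, for documentation only; the ref_ name is
+// illustrative (it would also need <math.h> for sqrtf). It computes
+// M = C - A*B^T and then factorizes the 8x4 panel, writing the lower part of
+// the Cholesky factor into D and the reciprocals of its diagonal into
+// inv_diag_D (assumed to be set to 0 when the pivot is not positive, as in
+// the other BLASFEO potrf kernels).
+//
+// static void ref_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D)
+//	{
+//	int i, j, l;
+//	float M[8*4];
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			M[i+j*8] = C[i+j*8];
+//			for(l=0; l<k; l++)
+//				M[i+j*8] -= A[i+l*8] * B[j+l*8];
+//			}
+//	for(j=0; j<4; j++)
+//		{
+//		for(l=0; l<j; l++)
+//			for(i=j; i<8; i++)
+//				M[i+j*8] -= D[i+l*8] * D[j+l*8]; // update with previous columns
+//		float d = sqrtf(M[j+j*8]);
+//		inv_diag_D[j] = d>0.0f ? 1.0f/d : 0.0f;
+//		for(i=j; i<8; i++)
+//			D[i+j*8] = M[i+j*8] * inv_diag_D[j]; // scale column j
+//		}
+//	}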
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+	movq	ARG7, %r11 // km
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
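+//
+// Fused syrk + potrf: the 8x4 block is first updated as
+// M = C + Ap*Bp^T - Am*Bm^T (kp and km iterations respectively) and then
+// factorized exactly as in the spotrf reference sketch above, writing the
+// lower part into D and the reciprocals of the diagonal into inv_diag_D.
+// Scalar sketch of the update step only (illustrative, with M[] standing for
+// the updated 8x4 block):
+//
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			float acc = C[i+j*8];
+//			for(l=0; l<kp; l++) acc += Ap[i+l*8] * Bp[j+l*8];
+//			for(l=0; l<km; l++) acc -= Am[i+l*8] * Bm[j+l*8];
+//			M[i+j*8] = acc;
+//			}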
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
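+//
+// Reference (scalar) version, for documentation only; the ref_ name is
+// illustrative. It computes D = alpha * A * B with B lower triangular on the
+// right, so terms with l<j drop out; B is addressed as in the nn gemm sketch
+// above (offsetB is the row offset inside its first panel, 8*sdb floats
+// between panels), which is an assumption about the layout rather than
+// something stated in this file.
+//
+// static void ref_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
+//	{
+//	int i, j, l, p, r;
+//	for(j=0; j<4; j++)
+//		for(i=0; i<8; i++)
+//			{
+//			float acc = 0.0f;
+//			for(l=j; l<k; l++) // B(l,j)==0 for l<j (lower triangular)
+//				{
+//				p = (offsetB+l)/8;
+//				r = (offsetB+l)%8;
+//				acc += A[i+l*8] * B[p*8*sdb + r + j*8];
+//				}
+//			D[i+j*8] = alpha[0]*acc;
+//			}
+//	}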
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
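+// NOTE: reference sketch, comments only. As I read the nn_rl naming and the
+// "initial triangle" edge above, these strmm kernels compute roughly
+//
+//   D[i][j] = alpha * sum_{l>=j} A[i][l] * B[l][j],   0<=i<8, 0<=j<4,
+//
+// i.e. a right-side multiply by a lower-triangular, non-transposed B, with
+// offsetB/sdb locating the triangle inside the 8-row panels of B.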
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall $5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall $5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall $5, %r12d // 8*sdd*sizeof(float)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
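+// NOTE: the .long initializers above are raw IEEE-754 single-precision bit
+// patterns, e.g. 1056964608 = 0x3F000000 = 0.5f, 1069547520 = 0x3FC00000 = 1.5f,
+// 1065353216 = 0x3F800000 = 1.0f, 3212836864 = 0xBF800000 = -1.0f; the LC00-LC03
+// comments list the same values from the highest lane down to lane 0.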
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x8_lib8.S b/kernel/avx/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..354fa83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5514 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
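+// NOTE: the ARG*/PROLOGUE/EPILOGUE macros above encode the two supported
+// calling conventions (comments only, for orientation):
+//
+//   System V AMD64 (OS_LINUX/OS_MAC): integer args 1-6 arrive in
+//   rdi, rsi, rdx, rcx, r8, r9; after "subq $STACKSIZE, %rsp" the return
+//   address sits at STACKSIZE(%rsp), so argument 7 is ARG7 = STACKSIZE+8(%rsp).
+//
+//   Windows x64 (OS_WINDOWS): args 1-4 arrive in rcx, rdx, r8, r9 and the
+//   caller reserves a 32-byte shadow space, so ARG5 = STACKSIZE+40(%rsp).
+//   xmm6-xmm15 are callee-saved there, which is why the Windows prologue
+//   also spills them (and rdi/rsi) to the larger 256-byte frame.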
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
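+// NOTE: hedged C-style reference of the accumulation done by
+// inner_kernel_gemm_add_nt_8x8_lib8 above (comments only). A and B are 8-row
+// panels, and the result stays in the shuffled ymm0-ymm7 layout documented in
+// the routine header (ymm0 = d00 d11 d22 d33 d40 d51 d62 d73, ...), which the
+// blend/scale helpers later unscramble:
+//
+//   for (l = 0; l < k; l++)
+//       for (j = 0; j < 8; j++)
+//           for (i = 0; i < 8; i++)
+//               d[i][j] += A[i + 8*l] * B[j + 8*l];   // nt: both read by panels
+//
+// The vbroadcastf128 + vshufps pairs rotate four B values across the lanes so
+// each vmulps fills a different diagonal of the 8x8 block without gathers.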
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+	addq %r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 132(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 164(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 196(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 228(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 136(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 168(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 200(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 232(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 140(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 172(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 204(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 236(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 144(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 176(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 208(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 240(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 148(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 180(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 212(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 244(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 152(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 184(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 216(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 248(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 156(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 188(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 220(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 252(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
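+// NOTE: hedged C-style reference of inner_kernel_gemm_add_nn_8x8_lib8 above
+// (comments only). B is read in panel-major order: element (l, j) of the
+// current 8-row panel lives at byte offset 4*l + 32*j, which is the
+// vbroadcastss pattern (0,32,...,224 for unroll 0, then +4 per k step), and
+// r14 = B + 8*sdb*sizeof(float) prefetches/advances to the next panel every
+// 8 k iterations:
+//
+//   for (l = 0; l < k; l++)
+//       for (j = 0; j < 8; j++)
+//           for (i = 0; i < 8; i++)
+//               d[i][j] += A[i + 8*l] * B[(l/8)*8*sdb + l%8 + 8*j];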
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r14d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1: // edge loop over the first kend = min(k, 8-offsetB) steps
+	// unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
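+// NOTE: sketch of the pointer arithmetic in the unaligned-B edge above
+// (comments only, offsets counted in floats unless stated otherwise):
+//
+//   kend = min(k, 8 - offsetB);       // k steps left in the first B panel
+//   B += offsetB;                     // skip into the panel
+//   run kend single-step updates (same body as the clean-up loop);
+//   if (k > kend) B += 8*sdb - 8;     // rewind and land on the next panel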
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 16(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 20(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 24(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 28(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 48(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 52(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 56(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 60(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 80(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 84(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 88(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 92(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vbroadcastss 112(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 116(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 120(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 124(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 16(%r11), %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 152(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 156(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 20(%r11), %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 188(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 24(%r11), %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 28(%r11), %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
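+// NOTE: hedged sketch of the right-lower-transposed solve above (comments
+// only). Using the header's names, D at r10 is the 8x8 lower factor stored
+// panel-major and inv_diag_D holds its pre-inverted diagonal, so the kernel
+// never divides:
+//
+//   for (j = 0; j < n; j++) {            // n clipped by kn in r12d
+//       X[:][j] *= inv_diag_D[j];
+//       for (jj = j+1; jj < n; jj++)
+//           X[:][jj] -= D[jj + 8*j] * X[:][j];   // broadcast of D(jj,j)
+//   }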
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm4, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+	vucomiss %xmm15, %xmm13 // d_44 > 0.0 ?
+ jbe 9f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+10:
+	vmovss %xmm13, 16(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm5, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+	vucomiss %xmm15, %xmm13 // d_55 > 0.0 ?
+ jbe 11f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+12:
+	vmovss %xmm13, 20(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm6, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+	vucomiss %xmm15, %xmm13 // d_66 > 0.0 ?
+ jbe 13f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+14:
+	vmovss %xmm13, 24(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm7, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+	vucomiss %xmm15, %xmm13 // d_77 > 0.0 ?
+ jbe 15f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+16:
+	vmovss %xmm13, 28(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
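+// A short sketch of the semantics, for reference only: the accumulators
+// ymm0..ymm7 hold the 8x8 result panel, one 8-float column per register, and
+// C (r12) is stored the same way, 32 bytes per column, so the routine
+// effectively computes
+//
+//   for(j=0; j<8; j++)
+//     for(i=0; i<8; i++)
+//       acc[i+8*j] = alpha[0]*acc[i+8*j] + beta[0]*C[i+8*j];
+//
+// with the beta==0.0 test below skipping the loads of C entirely.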
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
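+// Note: compared to inner_scale_ab_8x8_lib8 above, this variant first
+// un-shuffles the accumulators: the nt inner kernel leaves the columns
+// interleaved across ymm0..ymm3 (and ymm4..ymm7), and the vblendps sequences
+// below (masks 0xaa/0x55, then 0xcc/0x33) reassemble plain columns before
+// scaling by alpha and adding beta*C.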
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm4, 128(%r10)
+ vmovaps %ymm5, 160(%r10)
+ vmovaps %ymm6, 192(%r10)
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
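+// Note on the row mask computed below: km (r11) is converted to float,
+// broadcast, and subtracted from the constant vector .LC00 (defined elsewhere
+// in this file, presumably { 0.5, 1.5, ..., 7.5 }); the difference is negative
+// exactly in the lanes whose row index is smaller than km, and vmaskmovps uses
+// that sign bit to suppress the stores of the unwanted rows.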
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps 32(%r10), %ymm14
+ vblendps $0x01, %ymm14, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps 64(%r10), %ymm14
+ vblendps $0x03, %ymm14, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps 96(%r10), %ymm14
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps 128(%r10), %ymm14
+ vblendps $0x0f, %ymm14, %ymm4, %ymm4
+ vmovaps %ymm4, 128(%r10)
+ vmovaps 160(%r10), %ymm14
+ vblendps $0x1f, %ymm14, %ymm5, %ymm5
+ vmovaps %ymm5, 160(%r10)
+ vmovaps 192(%r10), %ymm14
+ vblendps $0x3f, %ymm14, %ymm6, %ymm6
+ vmovaps %ymm6, 192(%r10)
+ vmovaps 224(%r10), %ymm14
+ vblendps $0x7f, %ymm14, %ymm7, %ymm7
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmovaps 128(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmovaps 160(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmovaps 192(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmovaps 224(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmovaps 128(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmovaps 160(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmovaps 192(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmovaps 224(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
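+// Reference semantics, for illustration only (not part of the kernel): A and
+// B are 8 x k panels and C and D 8x8 panels in lib8 format, i.e. element
+// (i,j) of a panel X is stored at X[i+8*j], and B enters transposed (nt), so
+// the kernel computes
+//
+//   for(j=0; j<8; j++)
+//     for(i=0; i<8; i++) {
+//       float t = 0.0f;
+//       for(l=0; l<k; l++) t += A[i+8*l]*B[j+8*l];
+//       D[i+8*j] = alpha[0]*t + beta[0]*C[i+8*j];
+//     }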
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
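+// Same computation as kernel_sgemm_nt_8x8_lib8 above, but only the first km
+// rows and kn columns of the result are written back (inner_store_8x8_vs_lib8
+// masks the rows and skips the trailing columns), so this variant handles the
+// right and bottom edges of the output matrix.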
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
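+// Generalized variant: C and D are addressed through an offset/stride pair,
+// so they may start at any row inside a panel (only the offset==0 case is
+// implemented in the inner routines below, the others are still TODO), and
+// m0/m1, n0/n1 select the range of rows and columns that is actually written.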
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
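+// Same computation as the nt kernel, but B is not transposed: it is swept
+// across panels, so the kernel also takes offsetB (starting row inside the
+// first panel of B) and sdb (panel stride of B), and the edge routine below
+// consumes the unaligned head of B before the main inner loop.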
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
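+// ssyrk nt lower: the same accumulation as the sgemm nt kernel above
+// (D = alpha*A*B^T + beta*C on an 8x8 panel), but only the lower triangle of
+// the result is written back; inner_store_l_8x8_lib8 blends the strictly
+// upper part of D back in from memory so it is left untouched.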
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
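+// strsm nt rl inv: computes D = (C - A*B^T) * E^{-T} with E lower triangular
+// (right-lower-transposed solve); the reciprocals of the diagonal of E are
+// passed in inv_diag_E so the solve uses multiplications instead of
+// divisions. alpha and beta are implicitly 1.0, hence the _11_ blend routine.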
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movl $8, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // m1
+ movq ARG9, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
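+// Fused gemm + trsm kernel: accumulates Ap*Bp^T over kp iterations, subtracts
+// Am*Bm^T over km iterations, adds C, and then applies the same
+// right-lower-transposed triangular solve with E and inv_diag_E as
+// kernel_strsm_nt_rl_inv_8x8_lib8 above before storing D.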
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
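+// Reference sketch (plain C, under "#if 0", never assembled): my reading of the
+// fused syrk+potrf kernel above, with the same lib8 panel-layout assumption
+// (element (i,j) at i + 8*j). Only the lower triangle is produced; the exact
+// handling of a non-positive pivot in the assembly may differ from the guard
+// used here. Illustrative name only, not the BLASFEO API.
+#if 0
+#include <math.h>
+static void ref_ssyrk_spotrf_nt_l_8x8(int kp, const float *Ap, const float *Bp,
+	int km, const float *Am, const float *Bm, const float *C, float *D,
+	float *inv_diag_D)
+	{
+	int i, j, l;
+	for(j=0; j<8; j++)
+		for(i=j; i<8; i++) // lower triangle only
+			{
+			float t = C[i+8*j];
+			for(l=0; l<kp; l++) t += Ap[i+8*l]*Bp[j+8*l]; // syrk add part
+			for(l=0; l<km; l++) t -= Am[i+8*l]*Bm[j+8*l]; // syrk sub part
+			for(l=0; l<j; l++) t -= D[i+8*l]*D[j+8*l]; // Cholesky update
+			if(i==j)
+				{
+				float d = t>0.0f ? sqrtf(t) : 0.0f;
+				D[j+8*j] = d;
+				inv_diag_D[j] = d>0.0f ? 1.0f/d : 0.0f;
+				}
+			else
+				{
+				D[i+8*j] = t*inv_diag_D[j];
+				}
+			}
+	}
+#endif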
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
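+// The .long values above are IEEE-754 single-precision bit patterns: .LC00 is
+// the vector {0.5,...,7.5} (the comments list the high lane first), .LC03 is a
+// vector of ones, and so on. Subtracting the number of remaining elements m
+// from .LC00 sets the sign bit exactly in lanes i < m, which is what the
+// vmaskmovps tail stores key on. A small standalone check (plain C, kept under
+// "#if 0" so it is never assembled; names are illustrative only):
+#if 0
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+int main(void)
+	{
+	uint32_t lc00[8] = {1056964608, 1069547520, 1075838976, 1080033280,
+		1083179008, 1085276160, 1087373312, 1089470464};
+	float f[8];
+	memcpy(f, lc00, sizeof(f)); // reinterpret the bit patterns as floats
+	int m = 3; // pretend only 3 of 8 lanes are valid
+	for(int i=0; i<8; i++)
+		printf("lane %d: %4.1f  active: %d\n", i, f[i], f[i]-m<0.0f);
+	return 0; // prints 0.5..7.5 and marks lanes 0..2 as active
+	}
+#endif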
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_diag_lib8.c b/kernel/avx/kernel_sgemm_diag_lib8.c
new file mode 100644
index 0000000..63183b2
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_diag_lib8.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
+
+
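+// Scalar reference sketch (documentation only, compiled out with "#if 0") of the
+// kernel above: D(:,j) = alpha * A(:,j) * B[j] for 4 columns, with A and D stored
+// in 8-row panels so that, within a panel, element (i,j) sits at i + 8*j and
+// consecutive panels are 8*sda (resp. 8*sdd) floats apart. Hypothetical name,
+// not part of the BLASFEO API.
+#if 0
+static void ref_sgemm_diag_right_4_a0(int kmax, const float *alpha, const float *A,
+	int sda, const float *B, float *D, int sdd)
+	{
+	int i, j;
+	for(i=0; i<kmax; i++)
+		{
+		const float *a = A + (i/8)*8*sda + i%8;
+		float *d = D + (i/8)*8*sdd + i%8;
+		for(j=0; j<4; j++)
+			d[8*j] = alpha[0]*a[8*j]*B[j];
+		}
+	}
+#endif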
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
+
+
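+// The clean-up branches above build the store mask by subtracting the number of
+// remaining rows from the constants {0.5,...,7.5} and reinterpreting the result
+// as integers: the sign bit ends up set exactly in the lanes that must be
+// written, which is the convention _mm256_maskstore_ps expects. A minimal
+// standalone demo (compiled out with "#if 0"; needs AVX, e.g. gcc -mavx):
+#if 0
+#include <immintrin.h>
+#include <stdio.h>
+int main(void)
+	{
+	const float idx[8] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};
+	float m_f = 3.0f; // only the first 3 lanes are valid
+	__m256i mask = _mm256_castps_si256(
+		_mm256_sub_ps( _mm256_loadu_ps( idx ), _mm256_broadcast_ss( &m_f ) ) );
+	float dst[8] = {0.0f};
+	_mm256_maskstore_ps( dst, mask, _mm256_set1_ps( 7.0f ) );
+	for(int i=0; i<8; i++)
+		printf("%g ", dst[i]); // prints: 7 7 7 0 0 0 0 0
+	printf("\n");
+	return 0;
+	}
+#endif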
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+	const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+	const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_store_ps( &D[0], d_00 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+
+ }
+
+ }
+
+
+
+
diff --git a/kernel/avx/kernel_sgemv_4_lib8.S b/kernel/avx/kernel_sgemv_4_lib8.S
new file mode 100644
index 0000000..1508ebe
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_4_lib8.S
@@ -0,0 +1,2935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_4_lib8, @function
+inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_4_lib8, .-inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
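+// Scalar sketch (documentation only, compiled out with "#if 0") of the
+// accumulation performed by the inner routine above, per the register comments:
+// for 4 consecutive panel columns it forms z[j] += sum_k A(k,j)*x(k), with A
+// stored in 8-row panels (element (k,j) at (k/8)*8*sda + 8*j + k%8). The
+// assembly keeps 8 partial sums per column in one ymm register; the reduction
+// to a single value happens later in the inner_blend_t_scale_* routines.
+// Hypothetical helper name, not the BLASFEO API.
+#if 0
+static void ref_gemv_add_t_4(int kmax, const float *A, int sda, const float *x, float z[4])
+	{
+	int j, k;
+	for(j=0; j<4; j++)
+		for(k=0; k<kmax; k++)
+			z[j] += A[(k/8)*8*sda + 8*j + k%8]*x[k];
+	}
+#endif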
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t+k*sizeof(float)
+// r14 <- z_n+k*sizeof(float)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_nt_4_lib8, @function
+inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_nt_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_nt_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+ vmovups 0(%r14), %ymm13
+
+ vmovaps 0(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 64(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 96(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovups %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+// vmovups %ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_nt_4_lib8, .-inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+
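+// Scalar sketch (documentation only, compiled out with "#if 0") of the fused
+// routine above: in one sweep over the same 8-row panels it accumulates the
+// transposed product z_t[j] += sum_k A(k,j)*x_t(k) and updates the
+// non-transposed product z_n(k) += sum_j A(k,j)*x_n[j] in place, so A is read
+// only once (this is what the symv edges below build on). Hypothetical helper
+// name, not the BLASFEO API.
+#if 0
+static void ref_gemv_add_nt_4(int kmax, const float *A, int sda,
+	const float *x_t, const float x_n[4], float *z_n, float z_t[4])
+	{
+	int j, k;
+	for(k=0; k<kmax; k++)
+		for(j=0; j<4; j++)
+			{
+			float a = A[(k/8)*8*sda + 8*j + k%8];
+			z_t[j] += a*x_t[k]; // "t" part, reduced later
+			z_n[k] += a*x_n[j]; // "n" part, written back in place
+			}
+	}
+#endif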
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_4_lib8, @function
+inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_4_lib8, .-inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-(8-offA)
+// r11 <- A+4*k*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t+k*sizeof(float)
+// r14 <- z_n+k*sizeof(float)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4l_lib8, @function
+inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4l_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4l_lib8:
+#endif
+#endif
+
+ movl $8, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x0f, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4l_lib8, .-inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+
+
+
+
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4r_lib8, @function
+inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4r_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4r_lib8:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm13, %xmm11
+
+ vmaskmovps 0(%r13), %xmm11, %xmm12
+ vmaskmovps 0(%r14), %xmm11, %xmm13
+
+ vmaskmovps 0(%r11), %xmm11, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm0, %xmm15, %xmm0
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm6, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 32(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm1, %xmm15, %xmm1
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm7, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 64(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm2, %xmm15, %xmm2
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm8, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 96(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm3, %xmm15, %xmm3
+// vxorps %xmm15, %xmm15, %xmm15
+// vblendps $0x0f, %xmm15, %xmm14, %xmm14
+// vmulps %xmm14, %xmm9, %xmm15
+// vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps %xmm13, %xmm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4r_lib8, .-inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib8, @function
+inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vbroadcastss 0(%r11), %xmm15
+ vmovups 0(%r12), %xmm14
+ vmulps %xmm15, %xmm14, %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib8, .-inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+
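+// Intrinsics sketch (documentation only, compiled out with "#if 0") of the
+// reduction above: two vhaddps passes collapse the four 8-lane accumulators so
+// that each 128-bit half holds partial sums of (z0,z1,z2,z3), the halves are
+// added, and the resulting 4-vector is combined as alpha*z + beta*y.
+// Hypothetical helper, not the BLASFEO API.
+#if 0
+#include <immintrin.h>
+static __m128 ref_blend_t_scale_ab_4(__m256 acc0, __m256 acc1, __m256 acc2,
+	__m256 acc3, const float *alpha, const float *beta, const float *y)
+	{
+	__m256 t01 = _mm256_hadd_ps( acc0, acc1 ); // partial sums of z0, z1
+	__m256 t23 = _mm256_hadd_ps( acc2, acc3 ); // partial sums of z2, z3
+	__m256 t = _mm256_hadd_ps( t01, t23 ); // [z0 z1 z2 z3 | z0 z1 z2 z3] partials
+	__m128 z = _mm_add_ps( _mm256_castps256_ps128( t ), _mm256_extractf128_ps( t, 1 ) );
+	z = _mm_mul_ps( z, _mm_broadcast_ss( alpha ) ); // alpha*z
+	return _mm_add_ps( z, _mm_mul_ps( _mm_loadu_ps( y ), _mm_broadcast_ss( beta ) ) ); // + beta*y
+	}
+#endif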
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib8, @function
+inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vmovups 0(%r11), %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib8, .-inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib8, @function
+inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // beta
+ vmovups 0(%r10), %xmm14
+ vsubps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib8, .-inner_blend_t_scale_m11_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib8, @function
+inner_store_4_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib8; .scl 2; .type 32; .endef
+inner_store_4_lib8:
+#endif
+#endif
+
+ vmovups %xmm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib8, .-inner_store_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
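+// A minimal C sketch of the masked store below (illustrative names; km is the
+// number of valid elements). The mask compares the LC00 lane constant against
+// km broadcast as a float, so lane i is enabled when i + 0.5 < km, i.e.
+//
+//   for (int i = 0; i < 4; i++)
+//       if (i < km)
+//           D[i] = z[i];
+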
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib8, @function
+inner_store_4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm15, %xmm14, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib8, .-inner_store_4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
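+// A minimal C sketch of the generalized masked store (illustrative names;
+// k0 inclusive, k1 exclusive):
+//
+//   for (int i = 0; i < 4; i++)
+//       if (i >= k0 && i < k1)
+//           D[i] = z[i];
+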
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib8, @function
+inner_store_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib8, .-inner_store_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+
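+// A C reference sketch of this kernel (illustrative, assuming the lib8
+// panel-major layout: 8-row panels, panel stride sda in floats*8):
+//
+//   // z[j] = alpha*sum_i A(i,j)*x[i] + beta*y[j],  j = 0..3
+//   for (int j = 0; j < 4; j++) {
+//       float s = 0.0f;
+//       for (int i = 0; i < k; i++)
+//           s += A[(i/8)*8*sda + i%8 + j*8] * x[i];
+//       z[j] = alpha[0]*s + beta[0]*y[j];
+//   }
+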
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_lib8
+ .type kernel_sgemv_t_4_lib8, @function
+kernel_sgemv_t_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_lib8
+_kernel_sgemv_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_lib8
+ .def kernel_sgemv_t_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_lib8, .-kernel_sgemv_t_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .type kernel_sgemv_t_4_vs_lib8, @function
+kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_vs_lib8
+_kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .def kernel_sgemv_t_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_vs_lib8, .-kernel_sgemv_t_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .type kernel_sgemv_t_4_gen_lib8, @function
+kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_gen_lib8
+_kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .def kernel_sgemv_t_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_gen_lib8, .-kernel_sgemv_t_4_gen_lib8
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemv_nt_4_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+
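+// A C reference sketch of this fused kernel (illustrative; A(i,j) denotes the
+// panel-major element): one pass over a 4-column block of A produces both the
+// "n" and the "t" products:
+//
+//   for (int j = 0; j < 4; j++) {
+//       float s = 0.0f;
+//       for (int i = 0; i < k; i++) {
+//           z_n[i] += alpha_n[0] * A(i,j) * x_n[j];
+//           s      += A(i,j) * x_t[i];
+//       }
+//       z_t[j] = alpha_t[0]*s + beta_t[0]*y_t[j];
+//   }
+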
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_lib8
+ .type kernel_sgemv_nt_4_lib8, @function
+kernel_sgemv_nt_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_lib8
+_kernel_sgemv_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_lib8
+ .def kernel_sgemv_nt_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_lib8, .-kernel_sgemv_nt_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemv_nt_4_vs_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .type kernel_sgemv_nt_4_vs_lib8, @function
+kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_vs_lib8
+_kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .def kernel_sgemv_nt_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_vs_lib8, .-kernel_sgemv_nt_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4l_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
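+// A C reference sketch of this symmetric kernel (illustrative; A(i,j) is the
+// panel-major element, only the lower part i >= j is referenced):
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = j; i < k; i++) {
+//           z[i] += alpha[0] * A(i,j) * x[j];      // column ("n") update
+//           if (i != j)
+//               z[j] += alpha[0] * A(i,j) * x[i];  // row ("t") update
+//       }
+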
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_lib8
+ .type kernel_ssymv_l_4l_lib8, @function
+kernel_ssymv_l_4l_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_lib8
+_kernel_ssymv_l_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_lib8
+ .def kernel_ssymv_l_4l_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_lib8, .-kernel_ssymv_l_4l_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4r_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_lib8
+ .type kernel_ssymv_l_4r_lib8, @function
+kernel_ssymv_l_4r_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_lib8
+_kernel_ssymv_l_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_lib8
+ .def kernel_ssymv_l_4r_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_lib8, .-kernel_ssymv_l_4r_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4l_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .type kernel_ssymv_l_4l_gen_lib8, @function
+kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_gen_lib8
+_kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .def kernel_ssymv_l_4l_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_gen_lib8, .-kernel_ssymv_l_4l_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4r_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .type kernel_ssymv_l_4r_gen_lib8, @function
+kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_gen_lib8
+_kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .def kernel_ssymv_l_4r_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_gen_lib8, .-kernel_ssymv_l_4r_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
+
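+// LC00 is the lane-index constant used to build the vmaskmovps masks above;
+// a sketch of the idea (illustrative):
+//
+//   // lane i is enabled when the sign bit is set after the subtraction,
+//   // i.e. when (i + 0.5f) - (float)n < 0, which for integer n means i < n
+//   float lc00[8] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};
+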
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_sgemv_8_lib8.S b/kernel/avx/kernel_sgemv_8_lib8.S
new file mode 100644
index 0000000..aafd8cb
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_8_lib8.S
@@ -0,0 +1,2837 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- x+k*sizeof(float)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
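+// A C sketch of the accumulation performed here (illustrative; assumes A
+// points at one 8-row panel stored column by column, acc[] stands for
+// ymm0..ymm3; which accumulator a column lands in only matters for ILP,
+// since all of them are summed by the blend routines afterwards):
+//
+//   for (int i = 0; i < k; i++)
+//       for (int l = 0; l < 8; l++)
+//           acc[i % 4][l] += A[i*8 + l] * x[i];
+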
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_n_8_lib8, @function
+inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_n_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vbroadcastss 4(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vbroadcastss 8(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vbroadcastss 12(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $16, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $4, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+k*sda*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
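+// A C sketch of this transposed accumulation (illustrative; acc[] stands for
+// ymm0..ymm7, sda8 = 8*sda floats; the 8 lanes of each acc[j] are reduced
+// later by the blend routines):
+//
+//   for (int i = 0; i < k; i++)
+//       for (int j = 0; j < 8; j++)
+//           acc[j][i % 8] += A[(i/8)*sda8 + i%8 + j*8] * x[i];
+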
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_8_lib8, @function
+inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmaskmovps 128(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmaskmovps 160(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmaskmovps 192(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmaskmovps 224(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ sall $2, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
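+// This edge routine handles a start that is not aligned to the 8-row panel:
+// A and x are rewound by offA elements and one full panel is processed with
+// the lanes masked to the valid range, roughly (illustrative):
+//
+//   // lane l of the first panel contributes iff  offA <= l < offA + k
+//
+// afterwards k, A and x are advanced so the main kernel continues from the
+// next aligned panel.
+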
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_8_lib8, @function
+inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
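+// A C sketch of this forward substitution (illustrative; E(i,j) is the
+// panel-major element and inv_diag_E[i] = 1.0/E(i,i)):
+//
+//   for (int j = 0; j < 8; j++) {
+//       z[j] *= inv_diag_E[j];
+//       for (int i = j+1; i < 8; i++)
+//           z[i] -= E(i,j) * z[j];
+//   }
+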
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_lib8, @function
+inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_vs_lib8, @function
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $5, %r12d
+ jl 0f // ret
+
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $6, %r12d
+ jl 0f // ret
+
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $7, %r12d
+ jl 0f // ret
+
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $8, %r12d
+ jl 0f // ret
+
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
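+// A C sketch of this backward substitution with the transposed block
+// (illustrative; E(i,j), i >= j, is the stored lower part and
+// inv_diag_E[j] = 1.0/E(j,j)):
+//
+//   for (int j = 7; j >= 0; j--) {
+//       z[j] *= inv_diag_E[j];
+//       for (int i = 0; i < j; i++)
+//           z[i] -= E(j,i) * z[j];
+//   }
+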
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmovups 0(%r14), %ymm15
+ vblendvps %ymm14, %ymm0, %ymm15, %ymm0
+
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ cmpl $8, %r13d
+ jl 1f
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ cmpl $7, %r13d
+ jl 1f
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ cmpl $6, %r13d
+ jl 1f
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ cmpl $5, %r13d
+ jl 1f
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ cmpl $4, %r13d
+ jl 1f
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ cmpl $3, %r13d
+ jl 1f
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ cmpl $2, %r13d
+ jl 1f
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ cmpl $1, %r13d
+ jl 1f
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+1:
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib8, @function
+inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_8_lib8, @function
+inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib8, @function
+inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
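+	// ymm0 now holds the eight horizontally-summed results [z0 ... z7]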
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_8_lib8, @function
+inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
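+	// ymm0 now holds the eight horizontally-summed results [z0 ... z7]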
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib8, @function
+inner_store_8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib8; .scl 2; .type 32; .endef
+inner_store_8_lib8:
+#endif
+#endif
+
+ vmovups %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib8, .-inner_store_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_vs_lib8, @function
+inner_store_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm14, %ymm15
+
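+	// ymm15 has the sign bit set for lanes i < km, so only the first km entries of z are written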
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_gen_lib8, @function
+inner_store_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
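+	// sign bits: k0 - {0.5,...,7.5} marks lanes i >= k0, {0.5,...,7.5} - k1 marks lanes i < k1;
+	// their AND restricts the masked store to rows k0 <= i < k1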
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
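+//
+// in effect: z[0:8] = alpha * A[0:8,0:k] * x[0:k] + beta * y[0:8] for one 8-row panel of A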
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_lib8
+ .type kernel_sgemv_n_8_lib8, @function
+kernel_sgemv_n_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_lib8
+_kernel_sgemv_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_lib8
+ .def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .type kernel_sgemv_n_8_vs_lib8, @function
+kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_vs_lib8
+_kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .type kernel_sgemv_n_8_gen_lib8, @function
+kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_gen_lib8
+_kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+	movq	ARG8, %r11 // k0
+	movq	ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
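+//
+// in effect: z[0:8] = alpha * A[0:k,0:8]^T * x[0:k] + beta * y[0:8], with A stored in 8-row panels of panel stride sda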
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_lib8
+ .type kernel_sgemv_t_8_lib8, @function
+kernel_sgemv_t_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_lib8
+_kernel_sgemv_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_lib8
+ .def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .type kernel_sgemv_t_8_vs_lib8, @function
+kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_vs_lib8
+_kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .type kernel_sgemv_t_8_gen_lib8, @function
+kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_gen_lib8
+_kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
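+//
+// in effect: z = inv(L) * (y - A[0:8,0:k]*x[0:k]), where L is the 8x8 lower-triangular block that
+// follows the k columns of A and inv_diag_A holds its pre-inverted diagonal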
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .type kernel_strsv_ln_inv_8_lib8, @function
+kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_lib8
+_kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+k*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .type kernel_strsv_ln_inv_8_vs_lib8, @function
+kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_vs_lib8
+_kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+k*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
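+//
+// in effect: z = inv(L^T) * (y - A[8:k,0:8]^T * x[8:k]), where L is the 8x8 lower-triangular block
+// at the top of A and inv_diag_A holds its pre-inverted diagonal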
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+ movq ARG5, %r14 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG9, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgesc_lib8.S b/kernel/avx/kernel_sgesc_lib8.S
new file mode 100644
index 0000000..43ff708
--- /dev/null
+++ b/kernel/avx/kernel_sgesc_lib8.S
@@ -0,0 +1,506 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_lib8, @function
+inner_kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm15
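+	// alpha broadcast to all 8 lanes; the loops below scale the 8-row panel A in place,
+	// four 8-float columns per main-loop iteration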
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgesc_8_lib8, .-inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_gen_lib8, @function
+inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_gen_lib8, .-inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgesc_8_lib8(int k, float *alpha, float *A);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_lib8
+ .type kernel_sgesc_8_lib8, @function
+kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_lib8
+_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_lib8
+ .def kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_lib8, .-kernel_sgesc_8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgesc_8_gen_lib8(int k, float *alpha, float *A, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_gen_lib8
+ .type kernel_sgesc_8_gen_lib8, @function
+kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_gen_lib8
+_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_gen_lib8
+ .def kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_gen_lib8, .-kernel_sgesc_8_gen_lib8
+#endif
+
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgetr_lib8.S b/kernel/avx/kernel_sgetr_lib8.S
new file mode 100644
index 0000000..745c42e
--- /dev/null
+++ b/kernel/avx/kernel_sgetr_lib8.S
@@ -0,0 +1,2476 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_lib8, @function
+inner_kernel_sgetr_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
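+	// transpose one 8x8 block in registers: unpck{l,h}ps interleaves pairs of columns,
+	// shufps merges them into quadruples, and vperm2f128 swaps the 128-bit halves,
+	// producing the 8 rows of A^T, which are stored contiguously into B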
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 0(%r13)
+ vmovaps %ymm3, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 32(%r13)
+ vmovaps %ymm3, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 64(%r13)
+ vmovaps %ymm3, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 96(%r13)
+ vmovaps %ymm3, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmovaps %ymm8, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmovaps %ymm8, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_lib8, .-inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_gen_lib8, @function
+inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
+ vmovupd -32(%rsp), %ymm4
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 0(%r13)
+ vmaskmovps %ymm3, %ymm4, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 32(%r13)
+ vmaskmovps %ymm3, %ymm4, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 64(%r13)
+ vmaskmovps %ymm3, %ymm4, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 96(%r13)
+ vmaskmovps %ymm3, %ymm4, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmaskmovps %ymm8, %ymm9, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_gen_lib8, .-inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
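+
+// Note on the transpose scheme used by the sgetr inner kernels above (a sketch
+// of the idea, not a line-by-line mapping): each main-loop iteration loads one
+// 8x8 block (eight 8-float columns of the current source panel of A),
+// interleaves pairs of columns with vunpcklps/vunpckhps, merges quadruples
+// with vshufps, and recombines the 128-bit halves with vperm2f128, so that
+// each ymm stored to B holds one row of the source block, i.e. one column of
+// the transposed panel. In the _gen variants the vmaskmovps stores use the row
+// mask spilled at -32(%rsp) to write only the first m1 of the 8 output rows.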
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_0_gen_lib8, @function
+inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
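+	// Mask construction (sketch): .LC00 holds { 0.5, 1.5, ..., 7.5 } in
+	// ascending memory order; broadcasting (float)m1 and computing
+	// LC00[i] - m1 gives a negative value (sign bit set) exactly for
+	// i < m1, which vmaskmovps interprets as "store this element".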
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_0_gen_lib8, .-inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
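+
+// Roughly speaking, the _0 edge above only prepares the store mask, while the
+// _1 ... _7 edge variants below additionally transpose the leading partial
+// source panel: for offsetA==N they write the 8-N remaining rows of that panel
+// into the first 8-N columns of B before falling through to the aligned
+// inner kernel.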
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_1_gen_lib8, @function
+inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $224, %r13 // B+7*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_1_gen_lib8, .-inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_2_gen_lib8, @function
+inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $192, %r13 // B+6*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_2_gen_lib8, .-inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_3_gen_lib8, @function
+inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$160, %r13 // B+5*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_3_gen_lib8, .-inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_4_gen_lib8, @function
+inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$128, %r13 // B+4*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_4_gen_lib8, .-inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_5_gen_lib8, @function
+inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$96, %r13 // B+3*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_5_gen_lib8, .-inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_6_gen_lib8, @function
+inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$64, %r13 // B+2*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_6_gen_lib8, .-inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_7_gen_lib8, @function
+inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$32, %r13 // B+1*bs*sizeof(float)
+
+// jmp 2f
+//
+//3:
+// movl %r10d, %r14d
+// sall $2, %r14d
+// addq %r14, %r11 // A+k*sizeof(float)
+// movl %r10d, %r14d
+// sall $5, %r14d
+// addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_7_gen_lib8, .-inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_0_lib8(int k, float *A, int sda, float *B);
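+//
+// Sketch of the intended semantics (inferred from the inner kernels above, not
+// from a separate spec): B = A^T for a block of A that is at most 8 columns
+// wide and k rows tall, with A and B in the lib8 panel-major layout (8-row
+// panels, panel stride sda for A). The _0 ... _7 suffix is the row offset of A
+// within its first panel; the _gen variants take m1 and store only the first
+// m1 rows of B.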
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_lib8
+ .type kernel_sgetr_8_0_lib8, @function
+kernel_sgetr_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_lib8
+_kernel_sgetr_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_lib8
+ .def kernel_sgetr_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+ // offsetA==0: no edge
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_lib8, .-kernel_sgetr_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_0_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .type kernel_sgetr_8_0_gen_lib8, @function
+kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_gen_lib8
+_kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .def kernel_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==0: edge to compute mask
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_gen_lib8, .-kernel_sgetr_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_1_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_lib8
+ .type kernel_sgetr_8_1_lib8, @function
+kernel_sgetr_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_lib8
+_kernel_sgetr_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_lib8
+ .def kernel_sgetr_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_lib8, .-kernel_sgetr_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .type kernel_sgetr_8_1_gen_lib8, @function
+kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_gen_lib8
+_kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .def kernel_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_gen_lib8, .-kernel_sgetr_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_lib8
+ .type kernel_sgetr_8_2_lib8, @function
+kernel_sgetr_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_lib8
+_kernel_sgetr_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_lib8
+ .def kernel_sgetr_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_lib8, .-kernel_sgetr_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .type kernel_sgetr_8_2_gen_lib8, @function
+kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_gen_lib8
+_kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .def kernel_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_gen_lib8, .-kernel_sgetr_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_lib8
+ .type kernel_sgetr_8_3_lib8, @function
+kernel_sgetr_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_lib8
+_kernel_sgetr_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_lib8
+ .def kernel_sgetr_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_lib8, .-kernel_sgetr_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .type kernel_sgetr_8_3_gen_lib8, @function
+kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_gen_lib8
+_kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .def kernel_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_gen_lib8, .-kernel_sgetr_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_lib8
+ .type kernel_sgetr_8_4_lib8, @function
+kernel_sgetr_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_lib8
+_kernel_sgetr_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_lib8
+ .def kernel_sgetr_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_lib8, .-kernel_sgetr_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .type kernel_sgetr_8_4_gen_lib8, @function
+kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_gen_lib8
+_kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .def kernel_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_gen_lib8, .-kernel_sgetr_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_lib8
+ .type kernel_sgetr_8_5_lib8, @function
+kernel_sgetr_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_lib8
+_kernel_sgetr_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_lib8
+ .def kernel_sgetr_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_lib8, .-kernel_sgetr_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .type kernel_sgetr_8_5_gen_lib8, @function
+kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_gen_lib8
+_kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .def kernel_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_gen_lib8, .-kernel_sgetr_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_lib8
+ .type kernel_sgetr_8_6_lib8, @function
+kernel_sgetr_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_lib8
+_kernel_sgetr_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_lib8
+ .def kernel_sgetr_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_lib8, .-kernel_sgetr_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .type kernel_sgetr_8_6_gen_lib8, @function
+kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_gen_lib8
+_kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .def kernel_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_gen_lib8, .-kernel_sgetr_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_lib8
+ .type kernel_sgetr_8_7_lib8, @function
+kernel_sgetr_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_lib8
+_kernel_sgetr_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_lib8
+ .def kernel_sgetr_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_lib8, .-kernel_sgetr_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .type kernel_sgetr_8_7_gen_lib8, @function
+kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_gen_lib8
+_kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .def kernel_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_gen_lib8, .-kernel_sgetr_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
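+	// (The eight .long values above are the IEEE-754 bit patterns of
+	// 0.5, 1.5, ..., 7.5 in ascending memory order; LC01, LC02 and LC03
+	// below encode 8.5..15.5, 16.5..23.5, and six 1.0 followed by two
+	// -1.0 in the same way.)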
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/Makefile b/kernel/avx2/Makefile
new file mode 100644
index 0000000..adb91c4
--- /dev/null
+++ b/kernel/avx2/Makefile
@@ -0,0 +1,48 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_8x4_lib4.o kernel_dgemm_8x8_lib4.o kernel_dgemm_12x4_lib4.o kernel_dgemv_8_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgebp_lib4.o kernel_dgelqf_4_lib4.o
+OBJS += kernel_sgemm_24x4_lib8.o kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
diff --git a/kernel/avx2/kernel_dgebp_lib4.S b/kernel/avx2/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..4093b23
--- /dev/null
+++ b/kernel/avx2/kernel_dgebp_lib4.S
@@ -0,0 +1,2741 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
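+
+// The ARG*/PROLOGUE/EPILOGUE macros above abstract the two x86-64 calling
+// conventions: System V (Linux/Mac) passes the first six integer arguments in
+// rdi, rsi, rdx, rcx, r8, r9, while the Microsoft x64 ABI uses rcx, rdx, r8,
+// r9 plus the stack and additionally requires xmm6-xmm15 (and rdi/rsi) to be
+// preserved, hence the larger spill area in the Windows prologue.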
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dger4_sub_12r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
+
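+// What the kernel computes (an informal reading of the code below): a rank-4
+// update C -= A * B, where A is a fixed 12x4 block stored as three stacked
+// 4-row panels (panel stride 4*sda doubles), B is 4 x k, and C is 12 x k with
+// panel stride 4*sdc; each column of C is updated with four
+// broadcast-multiply-subtract (vfnmadd231pd) steps.
+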
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_12r_lib4
+ .type kernel_dger4_sub_12r_lib4, @function
+kernel_dger4_sub_12r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_12r_lib4
+_kernel_dger4_sub_12r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_12r_lib4
+ .def kernel_dger4_sub_12r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ vmovapd 0(%r11, %r12, 2), %ymm8
+ vmovapd 32(%r11, %r12, 2), %ymm9
+ vmovapd 64(%r11, %r12, 2), %ymm10
+ vmovapd 96(%r11, %r12, 2), %ymm11
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ vmovapd 32(%r14), %ymm12
+ vmovapd 32(%r14, %r15, 1), %ymm13
+ vmovapd 32(%r14, %r15, 2), %ymm14
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 32(%r14)
+ vmovapd %ymm13, 32(%r14, %r15, 1)
+ vmovapd %ymm14, 32(%r14, %r15, 2)
+
+ vmovapd 64(%r14), %ymm12
+ vmovapd 64(%r14, %r15, 1), %ymm13
+ vmovapd 64(%r14, %r15, 2), %ymm14
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 64(%r14)
+ vmovapd %ymm13, 64(%r14, %r15, 1)
+ vmovapd %ymm14, 64(%r14, %r15, 2)
+
+ vmovapd 96(%r14), %ymm12
+ vmovapd 96(%r14, %r15, 1), %ymm13
+ vmovapd 96(%r14, %r15, 2), %ymm14
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, -32(%r14)
+ vmovapd %ymm13, -32(%r14, %r15, 1)
+ vmovapd %ymm14, -32(%r14, %r15, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_12r_lib4, .-kernel_dger4_sub_12r_lib4
+#endif
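+
+// Reference semantics of kernel_dger4_sub_12r_lib4 as a plain-C sketch, for
+// documentation only (the helper name and the panel-major indexing below are
+// this comment's reading of the layout, not a published API):
+//
+//	void dger4_sub_12r_ref(int k, double *A, int sda, double *B, double *C, int sdc)
+//	{
+//		int i, j, l, p;
+//		for(j=0; j<k; j++)
+//			for(p=0; p<3; p++)           // three stacked 4-row panels
+//				for(i=0; i<4; i++)
+//					for(l=0; l<4; l++)
+//						C[p*4*sdc + j*4 + i] -= A[p*4*sda + l*4 + i] * B[j*4 + l];
+//	}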
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_12r_vs_lib4
+ .type kernel_dger4_sub_12r_vs_lib4, @function
+kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_12r_vs_lib4
+_kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_12r_vs_lib4
+ .def kernel_dger4_sub_12r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ vmaskmovpd 0(%r11, %r12, 2), %ymm15, %ymm8
+ vmaskmovpd 32(%r11, %r12, 2), %ymm15, %ymm9
+ vmaskmovpd 64(%r11, %r12, 2), %ymm15, %ymm10
+ vmaskmovpd 96(%r11, %r12, 2), %ymm15, %ymm11
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ vmovapd 32(%r14), %ymm12
+ vmovapd 32(%r14, %r15, 1), %ymm13
+ vmovapd 32(%r14, %r15, 2), %ymm14
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 32(%r14)
+ vmovapd %ymm13, 32(%r14, %r15, 1)
+ vmovapd %ymm14, 32(%r14, %r15, 2)
+
+ vmovapd 64(%r14), %ymm12
+ vmovapd 64(%r14, %r15, 1), %ymm13
+ vmovapd 64(%r14, %r15, 2), %ymm14
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 64(%r14)
+ vmovapd %ymm13, 64(%r14, %r15, 1)
+ vmovapd %ymm14, 64(%r14, %r15, 2)
+
+ vmovapd 96(%r14), %ymm12
+ vmovapd 96(%r14, %r15, 1), %ymm13
+ vmovapd 96(%r14, %r15, 2), %ymm14
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, -32(%r14)
+ vmovapd %ymm13, -32(%r14, %r15, 1)
+ vmovapd %ymm14, -32(%r14, %r15, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_12r_vs_lib4, .-kernel_dger4_sub_12r_vs_lib4
+#endif
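+
+// For reference, a minimal C sketch of what this kernel appears to compute, assuming the
+// lib4 panel-major layout (element (i,j) of a 4-row panel stored at offset i+4*j) and the
+// intended use 8 < km <= 12; the name ref_dger4_sub_12r_vs_lib4 is hypothetical. The mask
+// built above zeroes the rows of the third A panel at index >= km, so those rows of C are
+// stored back unchanged.
+//
+// void ref_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+// 	{
+// 	int i, j, l, p;
+// 	for(j=0; j<k; j++) // one column of C per iteration
+// 		for(p=0; p<3; p++) // three 4-row panels: rows 0-3, 4-7, 8-11
+// 			for(l=0; l<4; l++) // rank-4 update
+// 				for(i=0; i<4 && p*4+i<km; i++)
+// 					C[i+4*j+p*4*sdc] -= A[i+4*l+p*4*sda] * B[l+4*j];
+// 	return;
+// 	}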
+
+
+
+
+
+// 1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_lib4
+ .type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_lib4
+ .def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
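+
+// For reference, a minimal C sketch of what this kernel appears to compute, assuming the
+// lib4 panel-major layout (element (i,j) of a 4-row panel stored at offset i+4*j); the
+// name ref_dger4_sub_8r_lib4 is hypothetical. Keeping the two 4x4 blocks of A resident in
+// ymm0-ymm7 lets the loop above touch each column of C with two loads, two stores and
+// eight fused multiply-subtracts.
+//
+// void ref_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
+// 	{
+// 	int i, j, l, p;
+// 	for(j=0; j<k; j++) // one column of C per iteration
+// 		for(p=0; p<2; p++) // two 4-row panels: rows 0-3 and 4-7
+// 			for(l=0; l<4; l++) // rank-4 update
+// 				for(i=0; i<4; i++)
+// 					C[i+4*j+p*4*sdc] -= A[i+4*l+p*4*sda] * B[l+4*j];
+// 	return;
+// 	}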
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC01(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmaskmovpd 0(%r11, %r12, 1), %ymm15, %ymm4
+ vmaskmovpd 32(%r11, %r12, 1), %ymm15, %ymm5
+ vmaskmovpd 64(%r11, %r12, 1), %ymm15, %ymm6
+ vmaskmovpd 96(%r11, %r12, 1), %ymm15, %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger12_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger12_add_4r_lib4
+ .type kernel_dger12_add_4r_lib4, @function
+kernel_dger12_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger12_add_4r_lib4
+_kernel_dger12_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger12_add_4r_lib4
+ .def kernel_dger12_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger12_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // n
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d
+ movq ARG5, %r14 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+	jle		2f // single-column cleanup
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+	jg		1b // cleanup loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+	jg		1b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger12_add_4r_lib4, .-kernel_dger12_add_4r_lib4
+#endif
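+
+// For reference, a minimal C sketch of what this kernel appears to compute, assuming the
+// lib4 panel-major layout (element (i,j) of a 4-row panel stored at offset i+4*j; rows
+// 0-3, 4-7 and 8-11 of B sit in consecutive 4-row panels 4*sdb doubles apart); the name
+// ref_dger12_add_4r_lib4 is hypothetical.
+//
+// void ref_dger12_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+// 	{
+// 	int i, j, l;
+// 	for(j=0; j<n; j++) // one 4-double column of C per iteration
+// 		for(l=0; l<12; l++) // rank-12 update
+// 			for(i=0; i<4; i++)
+// 				C[i+4*j] += A[i+4*l] * B[(l/4)*4*sdb + l%4 + 4*j];
+// 	return;
+// 	}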
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger8_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger8_add_4r_lib4
+ .type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger8_add_4r_lib4
+ .def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // n
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d
+ movq ARG5, %r14 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+	jle		2f // single-column cleanup
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+	jg		1b // cleanup loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+	jg		1b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
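+
+// For reference, a minimal C sketch of what this kernel appears to compute (same layout
+// assumptions as for kernel_dger12_add_4r_lib4 above, with only rows 0-7 of B used); the
+// name ref_dger8_add_4r_lib4 is hypothetical.
+//
+// void ref_dger8_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+// 	{
+// 	int i, j, l;
+// 	for(j=0; j<n; j++) // one 4-double column of C per iteration
+// 		for(l=0; l<8; l++) // rank-8 update; B rows 0-3 and 4-7 sit in panels 4*sdb doubles apart
+// 			for(i=0; i<4; i++)
+// 				C[i+4*j] += A[i+4*l] * B[(l/4)*4*sdb + l%4 + 4*j];
+// 	return;
+// 	}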
+
+
+
+
+
+#if 0
+// 1 2 3 4 5
+// void kernel_dger8_sub_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger8_add_4r_lib4
+ .type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger8_add_4r_lib4
+ .def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+ sall $5, %r13d
+ movq ARG5, %r14
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+ vmovapd 128(%r11), %ymm4
+ vmovapd 160(%r11), %ymm5
+ vmovapd 192(%r11), %ymm6
+ vmovapd 224(%r11), %ymm7
+
+ cmpl $7, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // 04
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 8(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 16(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 24(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ // 14
+ vmovapd 32(%r14), %ymm12
+ vbroadcastsd 32(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 40(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 48(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 56(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 32(%r14)
+
+ // 24
+ vmovapd 64(%r14), %ymm12
+ vbroadcastsd 64(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 72(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 80(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 88(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 64(%r14)
+
+ // 34
+ vmovapd 96(%r14), %ymm12
+ vbroadcastsd 96(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 104(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 112(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 120(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 96(%r14)
+
+ // 44
+ vmovapd 128(%r14), %ymm12
+ vbroadcastsd 128(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 136(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 144(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 152(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 128(%r14)
+
+ // 54
+ vmovapd 160(%r14), %ymm12
+ vbroadcastsd 160(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 168(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 176(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 184(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 160(%r14)
+
+ // 64
+ vmovapd 192(%r14), %ymm12
+ vbroadcastsd 192(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 200(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 208(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 216(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 192(%r14)
+
+ // 74
+ vmovapd 224(%r14), %ymm12
+ vbroadcastsd 224(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 232(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 240(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 248(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 224(%r14)
+
+ // 08
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ // 18
+ vmovapd 32(%r14), %ymm12
+ vbroadcastsd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 40(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 48(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 56(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 32(%r14)
+
+ // 28
+ vmovapd 64(%r14), %ymm12
+ vbroadcastsd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+	vbroadcastsd	72(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 80(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 88(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 64(%r14)
+
+ // 38
+ vmovapd 96(%r14), %ymm12
+ vbroadcastsd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 104(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 112(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 120(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 96(%r14)
+
+ // 48
+ vmovapd 128(%r14), %ymm12
+ vbroadcastsd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 136(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 144(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 152(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 128(%r14)
+
+ // 58
+ vmovapd 160(%r14), %ymm12
+ vbroadcastsd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 168(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 176(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 184(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 160(%r14)
+
+ // 68
+ vmovapd 192(%r14), %ymm12
+ vbroadcastsd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 200(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 208(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 216(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 192(%r14)
+
+ // 78
+ vmovapd 224(%r14), %ymm12
+ vbroadcastsd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 232(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 240(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 248(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 224(%r14)
+
+ addq $256, %r12
+ addq $256, %r14
+ subl $8, %r10d
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 8(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 16(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 24(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_lib4
+ .type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_lib4
+ .def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
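+
+// For reference, a minimal C sketch of what this kernel appears to compute, assuming the
+// lib4 panel-major layout (element (i,j) of a 4-row panel stored at offset i+4*j); the
+// name ref_dger4_sub_4r_lib4 is hypothetical. The 4x4 block of A stays resident in
+// ymm0-ymm3 for the whole call, so each column of C costs one load, four fused
+// multiply-subtracts and one store.
+//
+// void ref_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
+// 	{
+// 	int i, j, l;
+// 	for(j=0; j<n; j++) // one 4-double column of C per iteration
+// 		for(l=0; l<4; l++) // rank-4 update
+// 			for(i=0; i<4; i++)
+// 				C[i+4*j] -= A[i+4*l] * B[l+4*j];
+// 	return;
+// 	}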
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger2_sub_4r_lib4(int n, double *A, double *B, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger2_sub_4r_lib4
+ .type kernel_dger2_sub_4r_lib4, @function
+kernel_dger2_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger2_sub_4r_lib4
+_kernel_dger2_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger2_sub_4r_lib4
+ .def kernel_dger2_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger2_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ addq $128, %r13
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger2_sub_4r_lib4, .-kernel_dger2_sub_4r_lib4
+#endif
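+
+// For reference, a minimal C sketch of what this kernel appears to compute, assuming the
+// lib4 panel-major layout (element (i,j) of a 4-row panel stored at offset i+4*j); the
+// name ref_dger2_sub_4r_lib4 is hypothetical. Only columns 0-1 of A and rows 0-1 of B are
+// read, but B keeps its full 4-row panel stride.
+//
+// void ref_dger2_sub_4r_lib4(int n, double *A, double *B, double *C)
+// 	{
+// 	int i, j, l;
+// 	for(j=0; j<n; j++) // one 4-double column of C per iteration
+// 		for(l=0; l<2; l++) // rank-2 update
+// 			for(i=0; i<4; i++)
+// 				C[i+4*j] -= A[i+4*l] * B[l+4*j];
+// 	return;
+// 	}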
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+ movq ARG5, %r14
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC00(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmaskmovpd 0(%r11), %ymm15, %ymm0
+ vmaskmovpd 32(%r11), %ymm15, %ymm1
+ vmaskmovpd 64(%r11), %ymm15, %ymm2
+ vmaskmovpd 96(%r11), %ymm15, %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
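+	// LC00/LC01/LC02 hold the lane indices {0.5 ... 11.5} used to turn an integer row
+	// count into a vmaskmovpd sign mask (LC00 serves the 4-row _vs kernel above; LC01
+	// and LC02 presumably serve wider variants elsewhere in this file)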
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+ .align 5
+#endif
+ .double 0.5
+ .double 1.5
+ .double 2.5
+ .double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double 4.5
+ .double 5.5
+ .double 6.5
+ .double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 8.5
+ .double 9.5
+ .double 10.5
+ .double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4.S b/kernel/avx2/kernel_dgelqf_4_lib4.S
new file mode 100644
index 0000000..2f8b1be
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4.S
@@ -0,0 +1,5728 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
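+
+// PROLOGUE/EPILOGUE save and restore the callee-saved registers of the active calling
+// convention (rbx, rbp, r12-r15 on System V; additionally rdi, rsi and xmm6-xmm15 on
+// Windows x64) and bracket every kernel with vzeroupper.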
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft12_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
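+//
+// Fused dgelqf + dlarft on a 12 x n lib4 block (three row panels of panel stride sdd):
+// for each of the 12 rows it computes a Householder reflector from the trailing row
+// entries, stores the scaling factor in dD, applies the reflector from the right to the
+// remaining rows, and accumulates the corresponding column of the 12x12 block-reflector
+// matrix T in pT (rough description inferred from the code below; the T row-panel
+// stride is hard-coded).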
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft12_12_lib4
+ .type kernel_dgelqf_dlarft12_12_lib4, @function
+kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft12_12_lib4
+_kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft12_12_lib4
+ .def kernel_dgelqf_dlarft12_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft12_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+	movq	$384, %r15 // sdt: byte offset between row panels of T (4*12*8, hard-coded)
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
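+	// Householder reflector for this row (the same pattern repeats for every later row):
+	// xmm15 already holds the sum of squares of the trailing row entries; beta is
+	// sqrt(alpha^2 + xmm15) given the sign opposite to alpha (LC00 acting as a sign mask)
+	// and overwrites the pivot, dD gets tau = (beta-alpha)/beta, the T diagonal gets -tau,
+	// and ymm15 is set to the factor (derived from the LC01 constant) used to scale the
+	// trailing entries of the row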
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 8(%r12) // dD[1]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 40(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 16(%r12) // dD[2]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 80(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+ // fourth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 24(%r12) // dD[3]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 120(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+// vpermpd $0x00, %ymm15, %ymm15 // beta
+
+ // fifth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 32(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 0(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 32(%r12) // dD[4]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 128(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11, %r14, 1), %ymm8
+ vbroadcastsd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11, %r14, 1)
+ vmovsd %xmm9, 64(%r11, %r14, 1)
+ vmovsd %xmm10, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11, %r14, 1), %ymm8
+ vbroadcastsd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11, %r14, 1), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 1)
+ vmovsd %xmm9, 32(%r11, %r14, 1)
+ vmovsd %xmm10, 64(%r11, %r14, 1)
+ vmovsd %xmm11, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vbroadcastsd 128(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+// vmovapd 128(%r13), %ymm0
+// vblendpd $0xf, %ymm15, %ymm0, %ymm15
+ vmovapd %ymm15, 128(%r13)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // sixth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 40(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 40(%r12) // dD[5]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 168(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11, %r14, 1)
+ vmovsd %xmm10, 104(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11, %r14, 1), %ymm8
+ vbroadcastsd 40(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11, %r14, 1), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 1)
+ vmovsd %xmm9, 40(%r11, %r14, 1)
+ vmovsd %xmm10, 72(%r11, %r14, 1)
+ vmovsd %xmm11, 104(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 128(%r13), %ymm14
+ vmovapd 128(%r13, %r15, 1), %ymm11
+ vblendpd $0x1, %ymm11, %ymm12, %ymm11
+ vpermpd $0x00, %ymm1, %ymm13 // vv
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmulpd %ymm11, %ymm13, %ymm11
+ //
+ vbroadcastsd 168(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 160(%r13, %r15, 1), %ymm0
+ vblendpd $0x1, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 160(%r13)
+ vmovapd %ymm11, 160(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // seventh column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+	vmovsd	%xmm14, 48(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 80(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 48(%r12) // dD[6]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 208(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11, %r14, 1), %ymm8
+ vbroadcastsd 48(%r11, %r14, 1), %ymm9
+ vbroadcastsd 80(%r11, %r14, 1), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 1)
+ vmovsd %xmm9, 48(%r11, %r14, 1)
+ vmovsd %xmm10, 80(%r11, %r14, 1)
+ vmovsd %xmm11, 112(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13 // vv
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13 // vv
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 208(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 192(%r13, %r15, 1), %ymm0
+ vblendpd $0x3, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 192(%r13)
+ vmovapd %ymm11, 192(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+	// eighth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+	vmovsd	%xmm14, 56(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 120(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 56(%r12) // dD[7]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 248(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11, %r14, 1), %ymm8
+ vbroadcastsd 56(%r11, %r14, 1), %ymm9
+ vbroadcastsd 88(%r11, %r14, 1), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 1)
+ vmovsd %xmm9, 56(%r11, %r14, 1)
+ vmovsd %xmm10, 88(%r11, %r14, 1)
+ vmovsd %xmm11, 120(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 248(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+// vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 224(%r13, %r15, 1), %ymm0
+ vblendpd $0x7, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 224(%r13)
+ vmovapd %ymm11, 224(%r13, %r15, 1)
+
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0xf, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+// vpermpd $0x00, %ymm15, %ymm15 // beta
+
+ // ninth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 0(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 64(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 256(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11, %r14, 2), %ymm8
+ vbroadcastsd 64(%r11, %r14, 2), %ymm9
+ vbroadcastsd 96(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11, %r14, 2)
+ vmovsd %xmm9, 64(%r11, %r14, 2)
+ vmovsd %xmm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11, %r14, 2), %ymm8
+ vbroadcastsd 32(%r11, %r14, 2), %ymm9
+ vbroadcastsd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 2)
+ vmovsd %xmm9, 32(%r11, %r14, 2)
+ vmovsd %xmm10, 64(%r11, %r14, 2)
+ vmovsd %xmm11, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 256(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+// vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+// vmovapd 224(%r13, %r15, 1), %ymm0
+// vblendpd $0xf, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 256(%r13)
+ vmovapd %ymm11, 256(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // tenth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 40(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 72(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 296(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11, %r14, 2), %ymm9
+ vbroadcastsd 104(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11, %r14, 2)
+ vmovsd %xmm10, 104(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11, %r14, 2), %ymm8
+ vbroadcastsd 40(%r11, %r14, 2), %ymm9
+ vbroadcastsd 72(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 2)
+ vmovsd %xmm9, 40(%r11, %r14, 2)
+ vmovsd %xmm10, 72(%r11, %r14, 2)
+ vmovsd %xmm11, 104(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 296(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 288(%r13, %r15, 2), %ymm0
+ vblendpd $0x1, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 288(%r13)
+ vmovapd %ymm11, 288(%r13, %r15, 1)
+ vmovapd %ymm10, 288(%r13, %r15, 2)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // eleventh column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 80(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 80(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 336(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11, %r14, 2), %ymm8
+ vbroadcastsd 48(%r11, %r14, 2), %ymm9
+ vbroadcastsd 80(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 2)
+ vmovsd %xmm9, 48(%r11, %r14, 2)
+ vmovsd %xmm10, 80(%r11, %r14, 2)
+ vmovsd %xmm11, 112(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0x55, %ymm2, %ymm13
+ vmovapd 288(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 288(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 288(%r13, %r15, 2), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 336(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 320(%r13, %r15, 2), %ymm0
+ vblendpd $0x3, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 320(%r13)
+ vmovapd %ymm11, 320(%r13, %r15, 1)
+ vmovapd %ymm10, 320(%r13, %r15, 2)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+	// twelfth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 120(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 88(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 376(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11, %r14, 2), %ymm8
+ vbroadcastsd 56(%r11, %r14, 2), %ymm9
+ vbroadcastsd 88(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 2)
+ vmovsd %xmm9, 56(%r11, %r14, 2)
+ vmovsd %xmm10, 88(%r11, %r14, 2)
+ vmovsd %xmm11, 120(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0x55, %ymm2, %ymm13
+ vmovapd 288(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 288(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 288(%r13, %r15, 2), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0xaa, %ymm2, %ymm13
+ vmovapd 320(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 320(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 320(%r13, %r15, 2), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 376(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+// vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 352(%r13, %r15, 2), %ymm0
+ vblendpd $0x7, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 352(%r13)
+ vmovapd %ymm11, 352(%r13, %r15, 1)
+ vmovapd %ymm10, 352(%r13, %r15, 2)
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft12_12_lib4, .-kernel_dgelqf_dlarft12_12_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft4_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
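+//
+// Note (inferred from the LAPACK-style names gelqf/larft and from the code
+// below; not documented in this file): this kernel appears to LQ-factorize the
+// top 4 rows of a 12-row, panel-major (lib4) block, apply the resulting row
+// reflectors to the remaining rows, and accumulate the 4x4 triangular factor T
+// of the block reflector. Reading the argument usage: n is the row length, pD
+// points to the panel-major data, sdd is the panel stride (converted to the
+// byte offset between 4-row panels via "sall $5"), dD receives the reflector
+// scaling factors, and pT receives T.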
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_12_lib4
+ .type kernel_dgelqf_dlarft4_12_lib4, @function
+kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_12_lib4
+_kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_12_lib4
+ .def kernel_dgelqf_dlarft4_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
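+	// note (reading of the code, not stated in the source): each of the 4
+	// columns below appears to follow the same pattern. Loop 100 (first
+	// column only) accumulates the squared norm of the remaining entries of
+	// the current row; block 101 forms the Householder quantities in a
+	// dlarfg-like way (beta overwrites the pivot in pD, the scaling factor
+	// goes to dD, the diagonal entry of T to pT); loops 103/105 build the
+	// product of the trailing columns with the scaled reflector; loops
+	// 106/108 apply the rank-1 update to pD while already accumulating the
+	// squared norm needed for the next column.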
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 8(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 40(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 16(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 80(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
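+	// fourth column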
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 24(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 120(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_12_lib4, .-kernel_dgelqf_dlarft4_12_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft4_8_lib4(int n, double *pD, int sdd, double *dD, double *pT)
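+//
+// Note (inferred, as for the 12-row variant above): same operation restricted
+// to an 8-row block, i.e. two 4-row panels of pD instead of three, with the
+// same argument roles (n row length, sdd panel stride, dD reflector scaling
+// factors, pT the 4x4 T factor).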
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_8_lib4
+ .type kernel_dgelqf_dlarft4_8_lib4, @function
+kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_8_lib4
+_kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_8_lib4
+ .def kernel_dgelqf_dlarft4_8_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 8(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 40(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 16(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 80(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 24(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 120(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_8_lib4, .-kernel_dgelqf_dlarft4_8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
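+//
+// Sketch of the per-column Householder step implemented below (illustrative; it
+// mirrors the reference C routine kernel_dgelqf_dlarft_4_lib4 added further down
+// in this commit, in kernel/avx2/kernel_dgelqf_4_lib4_bkp.c):
+//
+//   beta = 0.0;
+//   for(jj=ii+1; jj<n; jj++)
+//     beta += pD[ii+ps*jj] * pD[ii+ps*jj];    // squared norm of the row tail
+//   if(beta==0.0)
+//     dD[ii] = 0.0;                           // nothing to annihilate
+//   else {
+//     alpha = pD[ii+ps*ii];
+//     beta  = sqrt(beta + alpha*alpha);
+//     if(alpha>0.0) beta = -beta;             // sign opposite to alpha
+//     dD[ii] = (beta-alpha)/beta;             // tau
+//     pT[ii+ps*ii] = -dD[ii];
+//     for(jj=ii+1; jj<n; jj++)
+//       pD[ii+ps*jj] *= 1.0/(alpha-beta);     // scaled reflector tail
+//     pD[ii+ps*ii] = beta;
+//   }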
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_4_lib4
+ .type kernel_dgelqf_dlarft4_4_lib4, @function
+kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_4_lib4
+_kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_4_lib4
+ .def kernel_dgelqf_dlarft4_4_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG4, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r12 // dD
+ movq ARG4, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm8, %ymm8
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vmulpd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 8(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 40(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm9, %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vmulpd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 16(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 80(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm1
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+ vblendpd $0x7, %ymm1, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm10, %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 24(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 120(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm1
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+
+ vbroadcastsd 120(%r13), %ymm15
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm1, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_4_lib4, .-kernel_dgelqf_dlarft4_4_lib4
+#endif
+
+
+
+
+
+// 1 2
+// void kernel_dlarfb_12_lib4(double *pK, double *pT)
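+//
+// Illustrative reading of the routine below (not official documentation): K is a
+// 12x4 panel at pK, T is a 12x12 factor stored in three 4-row panels at pT with a
+// hard-coded 384-byte panel stride, and only the upper triangle of T is referenced.
+// The panel is overwritten as K := T^T * K:
+//
+//   for(ii=11; ii>=0; ii--)                       // row ii of the result
+//     K[ii][0..3] = sum_{jj<=ii} T[jj][ii] * K[jj][0..3];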
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb_12_lib4
+ .type kernel_dlarfb_12_lib4, @function
+kernel_dlarfb_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb_12_lib4
+_kernel_dlarfb_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb_12_lib4
+ .def kernel_dlarfb_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb_12_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // K
+ movq ARG2, %r11 // T
+	movq	$384, %r12 // sdt: T panel stride hard-coded to 384 bytes (= 12*4*sizeof(double)) !!!
+
+ //
+ vmovapd 352(%r10), %ymm12
+ vbroadcastsd 376(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm11
+ //
+ vmovapd 320(%r10), %ymm12
+ vbroadcastsd 368(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm10
+ //
+ vmovapd 288(%r10), %ymm12
+ vbroadcastsd 360(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm9
+ //
+ vmovapd 256(%r10), %ymm12
+ vbroadcastsd 352(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm8
+ //
+ vmovapd 224(%r10), %ymm12
+ vbroadcastsd 376(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 344(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 312(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 280(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 248(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm7
+ //
+ vmovapd 192(%r10), %ymm12
+ vbroadcastsd 368(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 304(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 272(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 240(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 208(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm6
+ //
+ vmovapd 160(%r10), %ymm12
+ vbroadcastsd 360(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 264(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 232(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 200(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 168(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm5
+ //
+ vmovapd 128(%r10), %ymm12
+ vbroadcastsd 352(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 224(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 192(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 160(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 128(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm4
+ //
+ vmovapd 96(%r10), %ymm12
+ vbroadcastsd 376(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 344(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 312(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 280(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 248(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 216(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 184(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 152(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 120(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm3
+ //
+ vmovapd 64(%r10), %ymm12
+ vbroadcastsd 368(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 304(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 272(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 240(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 208(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 176(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 144(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm2
+ //
+ vmovapd 32(%r10), %ymm12
+ vbroadcastsd 360(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 264(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 232(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 200(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 168(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 136(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm1
+ //
+ vmovapd 0(%r10), %ymm12
+ vbroadcastsd 352(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 224(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 192(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 160(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 128(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm0
+
+ vmovapd %ymm11, 352(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm7, 224(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm0, 0(%r10)
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb_12_lib4, .-kernel_dlarfb_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // sign-bit mask { 100...0 x4 }, i.e. { -0.0, -0.0, -0.0, -0.0 }
+#elif defined(OS_MAC)
+LC00: // sign-bit mask { 100...0 x4 }, i.e. { -0.0, -0.0, -0.0, -0.0 }
+ .align 5
+#endif
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double -1.0
+ .double -1.0
+ .double -1.0
+ .double -1.0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 1.0
+ .double 1.0
+ .double 1.0
+ .double 1.0
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
new file mode 100644
index 0000000..05c2d2e
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
@@ -0,0 +1,282 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+// assume n>=4
+void kernel_dgelqf_dlarft_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+	return; // early return: this backup copy is disabled, the body below is kept for reference only
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = -1.0 / (beta-alpha);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ w3 = - dD[2] * w3;
+//printf("\n%f %f %f\n", pT[0+ps*2], pT[1+ps*2], w3);
+//return;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ return;
+ }
+
+
+
+
diff --git a/kernel/avx2/kernel_dgemm_12x4_lib4.S b/kernel/avx2/kernel_dgemm_12x4_lib4.S
new file mode 100644
index 0000000..766cb92
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_12x4_lib4.S
@@ -0,0 +1,15536 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
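+//
+// C-level sketch of the update computed below (illustrative only; it assumes the
+// lib4 panel-major storage with ps=4, and the array names are hypothetical, not
+// library API). The 12x4 accumulator D lives in ymm0..3, ymm4..7, ymm8..11,
+// covering rows 0-3, 4-7 and 8-11 respectively; A spans three 4 x k panels at
+// r11, r11+r12, r11+2*r12, and B is a 4 x k panel at r13:
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<12; ii++)
+//         D[ii][jj] += A[ii][kk] * B[jj][kk];   // "nt": B is accessed transposed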
+
+
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_12x4_lib4, @function
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+//	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%r13), %ymm15 // B[0]
+ vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 32(%r11), %ymm12 // A0[4]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 32(%r11, %r12, 1), %ymm13 // A1[4]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 32(%r13), %ymm15 // B[4]
+ vmovapd 32(%r11, %r12, 2), %ymm14 // A2[4]
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 64(%r11), %ymm12 // A0[8]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 64(%r11, %r12, 1), %ymm13 // A1[8]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 64(%r13), %ymm15 // B[8]
+ vmovapd 64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r11), %ymm12 // A0[12]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm13 // A1[12]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r13), %ymm15 // B[12]
+ vmovapd 96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $128, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 0(%r13), %ymm15 // B[0]
+ vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 32(%r11), %ymm12 // A0[4]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 32(%r11, %r12, 1), %ymm13 // A1[4]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 32(%r13), %ymm15 // B[4]
+ vmovapd 32(%r11, %r12, 2), %ymm14 // A2[4]
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 64(%r11), %ymm12 // A0[8]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 64(%r11, %r12, 1), %ymm13 // A1[8]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 64(%r13), %ymm15 // B[8]
+ vmovapd 64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r11), %ymm12 // A0[12]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm13 // A1[12]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r13), %ymm15 // B[12]
+ vmovapd 96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $128, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+// cmpl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+// vmovapd 0(%r11), %ymm12 // A0[0]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+// vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+// vmovapd 0(%r13), %ymm15 // B[0]
+// vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+	vmovapd 0(%r11), %ymm12 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+	vmovapd 0(%r13), %ymm15 // B[0]
+	vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $32, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ subl $1, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_12x4_lib4, .-inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
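+// plain-C sketch (illustrative only, not part of the library): identical to
+// the add_nt_12x4 reference loop given above, except that the product is
+// subtracted, i.e. for each ll<k, jj<4, ii<12:
+//
+//     D[ii][jj] -= A[(ii/4)*4*sda + ll*4 + ii%4] * B[ll*4 + jj];
+//
+// so the routine computes D -= A * B^T on the accumulators in ymm0..ymm11.
+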
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_12x4_lib4, @function
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+//	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_12x4_lib4, .-inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
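+// plain-C sketch (illustrative only, not part of the library) of the update on
+// the accumulators in ymm0..ymm11, assuming the 4-wide panel-major lib4 layout
+// for both A (panel stride 4*sda doubles) and B (panel stride 4*sdb doubles):
+//
+//     static void ref_dgemm_add_nn_12x4_lib4(int k, const double *A, int sda,
+//                                            const double *B, int sdb,
+//                                            double D[12][4])
+//         {
+//         int ii, jj, ll;
+//         for(ll=0; ll<k; ll++)
+//             for(jj=0; jj<4; jj++)
+//                 for(ii=0; ii<12; ii++)
+//                     D[ii][jj] += A[(ii/4)*4*sda + ll*4 + ii%4]
+//                                * B[(ll/4)*4*sdb + jj*4 + ll%4];
+//         }
+//
+// i.e. D += A * B with A 12 x k and B k x 4; the helper name and the D array
+// are illustrative only.
+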
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_12x4_lib4, @function
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+//	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_12x4_lib4, .-inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
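+// plain-C sketch (illustrative only): same reference loop as the add_nn_12x4
+// routine above, with the product subtracted, i.e. for each ll<k, jj<4, ii<12:
+//
+//     D[ii][jj] -= A[(ii/4)*4*sda + ll*4 + ii%4] * B[(ll/4)*4*sdb + jj*4 + ll%4];
+//
+// so the routine computes D -= A * B on the accumulators in ymm0..ymm11.
+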
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_12x4_lib4, @function
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd 96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+//	vmovapd 0(%r11, %r12, 2), %ymm15 // A2
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_12x4_lib4, .-inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
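+// plain-C sketch (illustrative only, not part of the library) of the update on
+// the 4x12 accumulator block in ymm0..ymm11, with A a single 4 x k panel and B
+// k x 12 in panel-major lib4 storage (panel stride 4*sdb doubles):
+//
+//     static void ref_dgemm_add_nn_4x12_lib4(int k, const double *A,
+//                                            const double *B, int sdb,
+//                                            double D[4][12])
+//         {
+//         int ii, jj, ll;
+//         for(ll=0; ll<k; ll++)
+//             for(jj=0; jj<12; jj++)
+//                 for(ii=0; ii<4; ii++)
+//                     D[ii][jj] += A[ll*4 + ii] * B[(ll/4)*4*sdb + jj*4 + ll%4];
+//         }
+//
+// i.e. D += A * B; the helper name and the D array are illustrative only.
+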
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x12_lib4, @function
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+ prefetcht0 256(%r12, %r13, 2) // software prefetch
+ prefetcht0 320(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 264(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 296(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 328(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 360(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 272(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 304(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 336(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 368(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 280(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 312(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 344(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 376(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 264(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 296(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 328(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 360(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 272(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 304(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 336(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 368(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 280(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 312(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 344(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 376(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x12_lib4, .-inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+// ymm8 <- [a80 a90 aa0 ab0]
+// ymm9 <- [a81 a91 aa1 ab1]
+// ymm10 <- [a82 a92 aa2 ab2]
+// ymm11 <- [a83 a93 aa3 ab3]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+// ymm8 <- [a80 a90 aa0 ab0]
+// ymm9 <- [a81 a91 aa1 ab1]
+// ymm10 <- [a82 a92 aa2 ab2]
+// ymm11 <- [a83 a93 aa3 ab3]
+
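+// plain-C sketch (illustrative only, not part of the library): this routine
+// streams over k columns of C and B and adds the product with the fixed 12x4
+// block A held in ymm0..ymm11; with C panel-major (panel stride 4*sdc doubles)
+// and B a single 4-row panel:
+//
+//     for(ll=0; ll<k; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<12; ii++)
+//                 C[(ii/4)*4*sdc + ll*4 + ii%4] += A[ii][jj] * B[ll*4 + jj];
+//
+// i.e. C(12 x k) += A(12 x 4) * B(4 x k), with A[ii][jj] denoting the register
+// block documented above.
+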
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_12x4_lib4, @function
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ vmovapd %ymm15, 32(%r12, %r13, 2)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ vmovapd %ymm15, 64(%r12, %r13, 2)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+ vmovapd %ymm15, -32(%r12, %r13, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_12x4_lib4, .-inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 32*sdb
+// r14 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <-
+// ymm5 <-
+// ymm6 <-
+// ymm7 <-
+// ymm8 <-
+// ymm9 <-
+// ymm10 <-
+// ymm11 <-
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A
+// r12 <- B+?
+// r13 <- 32*sdb
+// r14 <- C+?
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <-
+// ymm5 <-
+// ymm6 <-
+// ymm7 <-
+// ymm8 <-
+// ymm9 <-
+// ymm10 <-
+// ymm11 <-
+
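+// As a reference only (a sketch, not part of the kernel): this routine streams
+// over the columns of B and C up to twelve at a time, reloading the fixed 4x12
+// block of A from r11 and keeping the current 4x12 block of C in ymm0-ymm11.
+// A rough scalar equivalent in C, assuming plain dense indexing (the lib4
+// panel-major addressing and the 32*sdb panel stride of B are omitted):
+//
+//   for (int jj = 0; jj < k; jj++)        // k = r10d, columns of B and C
+//       for (int ll = 0; ll < 12; ll++)   // the fixed 4x12 block of A
+//           for (int ii = 0; ii < 4; ii++)
+//               C[ii][jj] += A[ii][ll] * B[ll][jj];
+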
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x12_lib4, @function
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+	jle		2f // cleanup
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+	jg		1b // cleanup loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+	jg		1b // cleanup loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x12_lib4, .-inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
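+// Sketch of the edge handling below (an illustration only, not part of the
+// kernel): when B starts offB rows into a 4-row panel, only the remaining rows
+// of that panel are consumed here, one rank-1 update at a time, before B is
+// repositioned at the start of its next panel. In rough C, with D denoting the
+// ymm accumulators and plain dense indexing assumed:
+//
+//   int kend = (4-offB < k) ? 4-offB : k;  // rows of B left in its first panel
+//   for (int ll = 0; ll < kend; ll++)
+//       for (int jj = 0; jj < 4; jj++)
+//           for (int ii = 0; ii < 12; ii++)
+//               D[ii][jj] += A[ii][ll] * B[offB+ll][jj];
+//   k -= kend;
+//   // if k > 0, B is then advanced to the start of its next 4-row panel
+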
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_12x4_lib4, @function
+inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+ movq %r11, %rax // A1 <- A0
+ addq %r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %rax, %rbp // A2 <- A1
+ addq %r12, %rbp // A2 <- A1 + 4*sda*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%rax), %ymm14 // A1[0]
+ vmovapd 0(%rbp), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm13 // B[0]
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vfmadd231pd %ymm14, %ymm13, %ymm4
+ vfmadd231pd %ymm15, %ymm13, %ymm8
+ vbroadcastsd 32(%r13), %ymm13 // B[1]
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vfmadd231pd %ymm14, %ymm13, %ymm5
+ vfmadd231pd %ymm15, %ymm13, %ymm9
+ vbroadcastsd 64(%r13), %ymm13 // B[2]
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vfmadd231pd %ymm14, %ymm13, %ymm6
+ vfmadd231pd %ymm15, %ymm13, %ymm10
+ vbroadcastsd 96(%r13), %ymm13 // B[3]
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vfmadd231pd %ymm14, %ymm13, %ymm7
+ vfmadd231pd %ymm15, %ymm13, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+	addq	$32, %r11 // A0+1*bs*sizeof(double)
+	addq	$32, %rax // A1+1*bs*sizeof(double)
+	addq	$32, %rbp // A2+1*bs*sizeof(double)
+	addq	$8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_12x4_lib4, .-inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- [d08 d18 d28 d38]
+// ymm9 <- [d09 d19 d29 d39]
+// ymm10 <- [d0a d1a d2a d3a]
+// ymm11 <- [d0b d1b d2b d3b]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- [d08 d18 d28 d38]
+// ymm9 <- [d09 d19 d29 d39]
+// ymm10 <- [d0a d1a d2a d3a]
+// ymm11 <- [d0b d1b d2b d3b]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x12_lib4, @function
+inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x12_lib4, .-inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
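+// Sketch of the triangular edge below (an illustration only): the first four
+// columns of A are multiplied against the 4x4 upper-triangular corner of B,
+// which is accessed transposed (nt). In rough C, with D denoting the ymm
+// accumulators and plain dense indexing assumed:
+//
+//   for (int ll = 0; ll < 4; ll++)         // column ll of A, row ll of B^T
+//       for (int jj = 0; jj <= ll; jj++)   // only the upper triangle of B
+//           for (int ii = 0; ii < 12; ii++)
+//               D[ii][jj] += A[ii][ll] * B[jj][ll];
+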
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_12x4_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // A1 <- A0
+ addq %r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %r15, %r14 // A2 <- A1
+ addq %r11, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 64(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 96(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_12x4_lib4, .-inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_12x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %r15, %r14 // A2 <- A1
+ addq %r12, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r13), %ymm12
+ addq $32, %r13
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ addq $32, %r11
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ addq $32, %r15
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ addq $32, %r14
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ addq $32, %r11
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r14
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_12x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
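+// Sketch of the triangular edge below (an illustration only): the 4x4
+// lower-triangular corner of B is applied column of A by row of B; offB only
+// shifts where that corner starts inside its 4-row panel, and the jump to the
+// next panel uses bs*sdb. In rough C, with D denoting the ymm accumulators and
+// plain dense indexing assumed:
+//
+//   for (int ll = 0; ll < 4; ll++)         // row ll of B, column ll of A
+//       for (int jj = 0; jj <= ll; jj++)   // only the lower triangle of B
+//           for (int ii = 0; ii < 12; ii++)
+//               D[ii][jj] += A[ii][ll] * B[ll][jj];
+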
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_12x4_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_12x4_lib4, .-inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_12x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_12x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
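+// note: the blends below only reorder lanes, no arithmetic is performed; in rough
+// scalar terms (illustrative indexing, using the register roles documented above),
+// for each 4-row tile t = 0,1,2 the rotated accumulators are rearranged so that
+//
+//   //   ymm[4*t+j] = [ d(4t+0,j)  d(4t+1,j)  d(4t+2,j)  d(4t+3,j) ]   for j = 0..3
+//
+// i.e. register 4*t+j ends up holding plain column j of tile t.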
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_12x4_lib4, @function
+inner_blend_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_12x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_12x4_lib4, .-inner_blend_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose
+//
+// input arguments:
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
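+// note: the unpcklpd/unpckhpd + vperm2f128 sequence below is the usual in-register
+// 4x4 double transpose, applied independently to each register group ymm0-3,
+// ymm4-7 and ymm8-11; roughly, per group (illustrative pseudo-C):
+//
+//   //   for(i=0; i<4; i++) for(j=0; j<4; j++) out[j][i] = in[i][j];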
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_12x4_lib4, @function
+inner_tran_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_tran_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_12x4_lib4; .scl 2; .type 32; .endef
+inner_tran_12x4_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_12x4_lib4, .-inner_tran_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
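+// note: alpha and beta are both fixed at 1.0 here, so the code below is a plain
+// accumulation of C (illustrative pseudo-C, using the pointers documented above):
+//
+//   //   for(i=0; i<12; i++) for(j=0; j<4; j++) D[i][j] += C[i][j];
+//
+// with rows 0-3, 4-7 and 8-11 of C read from r10, r10+r11 and r10+2*r11.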
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_12x4_lib4, @function
+inner_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_12x4_lib4:
+#endif
+#endif
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_12x4_lib4, .-inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
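+// note: rough scalar sketch of the scaling below (illustrative pseudo-C; alpha at
+// r10, beta at r11, C at r12 with 4-row panel stride r13; beta==0.0 skips all
+// loads of C):
+//
+//   //   for(i=0; i<12; i++) for(j=0; j<4; j++) {
+//   //       D[i][j] *= alpha;
+//   //       if(beta!=0.0) D[i][j] += beta * C[i][j];
+//   //   }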
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_12x4_lib4, @function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_12x4_lib4:
+#endif
+#endif
+
+	vbroadcastsd	0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ movq %r15, %r14 // C2 <- C1
+ addq %r13, %r14 // C2 <- C1 + 4*sdc*sizeof(double)
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- &alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
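+// note: this is the beta=0 case: C is never read and the twelve accumulators are
+// simply scaled in place, i.e. roughly D[i][j] = alpha * D[i][j] for i=0..11,
+// j=0..3 (illustrative indexing).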
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_12x4_lib4, @function
+inner_scale_a0_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_12x4_lib4:
+#endif
+#endif
+
+	vbroadcastsd	0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_12x4_lib4, .-inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
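+// note: this routine is the blend (lane un-rotation into plain columns) followed
+// by the generic scaling, i.e. roughly D = alpha*D and, if beta != 0.0,
+// D += beta*C, with rows 0-3, 4-7 and 8-11 of C read from r12, r12+r13 and
+// r12+2*r13 (illustrative description, see the two routines above).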
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_12x4_lib4, @function
+inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_12x4_lib4:
+#endif
+#endif
+
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_12x4_lib4, .-inner_blend_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
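+// note: the code below operates on all twelve accumulators ymm0-ymm11, one 4x1
+// column of the 4x12 tile each; rough pseudo-C sketch (C columns stored
+// contiguously at r12, 32 bytes apart):
+//
+//   //   for(j=0; j<12; j++) for(i=0; i<4; i++) {
+//   //       D[i][j] *= alpha;
+//   //       if(beta!=0.0) D[i][j] += beta * C[i][j];
+//   //   }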
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x12_lib4, @function
+inner_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x12_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+ vmovapd 256(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x12_lib4, .-inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
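+// note: each 4x4 sub-block of the accumulator is first transposed (same
+// unpck/perm2f128 pattern as inner_tran_12x4_lib4), then scaled by alpha and,
+// if beta != 0.0, C is added with weight beta; the twelve 4x1 columns of C are
+// read from r12 at offsets 0, 32, ..., 352.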
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x12_lib4, @function
+inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x12_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 256(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x12_lib4, .-inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
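+// note: blend (lane un-rotation into plain columns) followed by a beta=1.0
+// accumulation, i.e. roughly D[i][j] += C[i][j] for i=0..11, j=0..3, with rows
+// 0-3, 4-7 and 8-11 of C read from r10, r10+r11 and r10+2*r11.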
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_12x4_lib4, @function
+inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_12x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_12x4_lib4, .-inner_blend_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
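+// note: transpose each 4x4 sub-block as in inner_tran_12x4_lib4, then add C
+// unchanged (alpha = beta = 1.0); the twelve 4x1 columns of C are read from r10
+// at offsets 0, 32, ..., 352.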
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x12_lib4, @function
+inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x12_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 256(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x12_lib4, .-inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
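+// note: rough scalar sketch of the factorization step below (illustrative
+// pseudo-C; column j of the 12x4 panel lives in ymm{j}/ymm{4+j}/ymm{8+j}, and the
+// exact early-exit points on kn differ slightly from the loop bound shown):
+//
+//   //   for(j=0; j<4 && j<kn; j++) {
+//   //       djj  = D[j][j];
+//   //       dinv = (djj > 0.0) ? 1.0/sqrt(djj) : 0.0;  // breakdown zeroes the column
+//   //       inv_diag_E[j] = dinv;
+//   //       for(i=0; i<12; i++) D[i][j] *= dinv;
+//   //       for(k=j+1; k<4; k++)
+//   //           for(i=0; i<12; i++) D[i][k] -= D[k][j] * D[i][j];
+//   //   }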
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_12x4_vs_lib4, @function
+inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_12x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_12x4_vs_lib4, .-inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
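+// note: rough scalar sketch of the right/lower/transposed solve below
+// (illustrative pseudo-C; E at r10, column-major in a 4x4 panel, with
+// 1.0/E[j][j] precomputed in inv_diag_E at r11; column j of the 12x4 panel
+// lives in ymm{j}/ymm{4+j}/ymm{8+j}):
+//
+//   //   for(j=0; j<4; j++) {
+//   //       for(i=0; i<12; i++) D[i][j] *= inv_diag_E[j];
+//   //       for(k=j+1; k<4; k++)
+//   //           for(i=0; i<12; i++) D[i][k] -= E[k][j] * D[i][j];
+//   //   }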
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_12x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_12x4_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
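+// note: same solve as above but over twelve columns: the 12x12 lower-triangular
+// factor is stored in three 4-row panels starting at r10 with panel stride r11,
+// and, roughly, for j = 0..11 column j of the 4x12 tile (ymm{j}) is scaled by
+// inv_diag_D[j] and then eliminated from every later column k > j.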
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x12_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 80(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 88(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x12_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
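+// note: variable-size variant of inner_edge_dtrsm_rlt_inv_12x4_lib4: kn (r12d)
+// is the number of columns actually needed, and the cmpl/jl checks below skip
+// the remaining updates and scalings once that count is reached.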
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $10, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $11, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 80(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $12, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 88(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
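+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel held in ymm0-11, E the 4x4 lower-triangular factor with unit
+// diagonal, indexed as [row][column]):
+//
+// for(j=1; j<4; j++)
+//     for(k=0; k<j; k++)
+//         for(i=0; i<12; i++)
+//             D[i][j] -= D[i][k] * E[j][k];
+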
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_12x4_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_12x4_lib4, .-inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
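+// same operation as inner_edge_dtrsm_rlt_one_12x4_lib4 above, restricted to
+// the first kn columns of the panel (a sketch, named as in that routine):
+//
+// for(j=1; j<4 && j<kn; j++)
+//     for(k=0; k<j; k++)
+//         for(i=0; i<12; i++)
+//             D[i][j] -= D[i][k] * E[j][k];
+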
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
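+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel held in ymm0-11, E the 4x4 upper-triangular factor, inv_diag_E
+// the reciprocals of its diagonal, indexed as [row][column]):
+//
+// for(j=3; j>=0; j--)
+//     {
+//     for(i=0; i<12; i++)
+//         D[i][j] *= inv_diag_E[j];
+//     for(k=0; k<j; k++)
+//         for(i=0; i<12; i++)
+//             D[i][k] -= D[i][j] * E[k][j];
+//     }
+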
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_12x4_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vfnmadd231pd %ymm11, %ymm12, %ymm10
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vfnmadd231pd %ymm11, %ymm12, %ymm9
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+ vfnmadd231pd %ymm11, %ymm12, %ymm8
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vfnmadd231pd %ymm10, %ymm12, %ymm9
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+ vfnmadd231pd %ymm10, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+ vfnmadd231pd %ymm9, %ymm12, %ymm8
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_12x4_lib4, .-inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
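+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel held in ymm0-11, E the 4x4 upper-triangular factor, inv_diag_E
+// the reciprocals of its diagonal, indexed as [row][column]):
+//
+// for(j=0; j<4; j++)
+//     {
+//     for(k=0; k<j; k++)
+//         for(i=0; i<12; i++)
+//             D[i][j] -= D[i][k] * E[k][j];
+//     for(i=0; i<12; i++)
+//         D[i][j] *= inv_diag_E[j];
+//     }
+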
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_12x4_lib4, @function
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vfnmadd231pd %ymm4, %ymm12, %ymm5
+ vfnmadd231pd %ymm8, %ymm12, %ymm9
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vfnmadd231pd %ymm4, %ymm12, %ymm6
+ vfnmadd231pd %ymm8, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vfnmadd231pd %ymm5, %ymm12, %ymm6
+ vfnmadd231pd %ymm9, %ymm12, %ymm10
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vfnmadd231pd %ymm4, %ymm12, %ymm7
+ vfnmadd231pd %ymm8, %ymm12, %ymm11
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vfnmadd231pd %ymm5, %ymm12, %ymm7
+ vfnmadd231pd %ymm9, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vfnmadd231pd %ymm6, %ymm12, %ymm7
+ vfnmadd231pd %ymm10, %ymm12, %ymm11
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_12x4_lib4, .-inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
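+// same operation as inner_edge_dtrsm_rut_inv_12x4_lib4 above, restricted by kn
+// (a sketch, named as in that routine; columns j >= kn with j > 0 are skipped,
+// column 0 is always scaled):
+//
+// for(j=3; j>=0; j--)
+//     {
+//     if(j>0 && j>=kn)
+//         continue;
+//     for(i=0; i<12; i++)
+//         D[i][j] *= inv_diag_E[j];
+//     for(k=0; k<j; k++)
+//         for(i=0; i<12; i++)
+//             D[i][k] -= D[i][j] * E[k][j];
+//     }
+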
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vfnmadd231pd %ymm11, %ymm12, %ymm10
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vfnmadd231pd %ymm11, %ymm12, %ymm9
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+ vfnmadd231pd %ymm11, %ymm12, %ymm8
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vfnmadd231pd %ymm10, %ymm12, %ymm9
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+ vfnmadd231pd %ymm10, %ymm12, %ymm8
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+ vfnmadd231pd %ymm9, %ymm12, %ymm8
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
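+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel held in ymm0-11, E the 12x12 lower-triangular factor with unit
+// diagonal, stored in 4-row panels r11 bytes apart, indexed as [row][column]):
+//
+// for(k=0; k<12; k++)
+//     for(j=0; j<4; j++)
+//         for(i=k+1; i<12; i++)
+//             D[i][j] -= E[i][k] * D[k][j];
+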
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_12x4_lib4, @function
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r12 // E1 <- E0
+ addq %r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r12, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // left block-column
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r12), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r12), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r12), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r12), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ addq $128, %r12
+ addq $128, %r13
+
+
+ // middle block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r12), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r13), %ymm14
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r12), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm14
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r12), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r13), %ymm14
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm4, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xff, %ymm5, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xff, %ymm6, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xff, %ymm7, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+
+ addq $128, %r13
+
+
+ // right block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r13), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0x00, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0x00, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0x00, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ vmovapd 32(%r13), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0x55, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0x55, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0x55, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ vmovapd 64(%r13), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_12x4_lib4, .-inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
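+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel held in ymm0-11, E the 12x12 upper-triangular factor stored in
+// 4-row panels r11 bytes apart, inv_diag_E the reciprocals of its diagonal):
+//
+// for(k=11; k>=0; k--)
+//     for(j=0; j<4; j++)
+//         {
+//         D[k][j] *= inv_diag_E[k];
+//         for(i=0; i<k; i++)
+//             D[i][j] -= E[i][k] * D[k][j];
+//         }
+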
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_12x4_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r13 // E1 <- E0
+ addq %r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r13, %r14 // E2 <- E1
+ addq %r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ vmovapd 352(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 88(%r12), %ymm12
+ vmovapd 352(%r13), %ymm15
+// vmovapd 352(%r10), %ymm11
+
+ vpermpd $0xff, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 352(%r10), %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 352(%r10), %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 352(%r10), %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 352(%r10), %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit load below already zeroes the upper lanes)
+ vmovapd 320(%r14), %xmm13
+ vbroadcastsd 80(%r12), %ymm12
+ vmovapd 320(%r13), %ymm15
+// vmovapd 320(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 320(%r10), %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 320(%r10), %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 320(%r10), %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 320(%r10), %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 288(%r14), %xmm13
+ vbroadcastsd 72(%r12), %ymm12
+ vmovapd 288(%r13), %ymm15
+// vmovapd 288(%r10), %ymm11
+
+ vpermpd $0x55, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 288(%r10), %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 288(%r10), %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 288(%r10), %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 288(%r10), %ymm14, %ymm3
+
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 256(%r13), %ymm15
+// vmovapd 256(%r10), %ymm11
+
+ vpermpd $0x00, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 256(%r10), %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 256(%r10), %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 256(%r10), %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 256(%r10), %ymm14, %ymm3
+
+
+ // middle-middle
+
+ vmovapd 224(%r13), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm15
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit load below already zeroes the upper lanes)
+ vmovapd 192(%r13), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm15
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r13), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm15
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm15
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_12x4_lib4, .-inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
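+// same operation as inner_edge_dtrsm_lun_inv_12x4_lib4 above, restricted by km
+// (a sketch, named as in that routine; rows k >= km with k > 8 are skipped):
+//
+// for(k=11; k>=0; k--)
+//     {
+//     if(k>8 && k>=km)
+//         continue;
+//     for(j=0; j<4; j++)
+//         {
+//         D[k][j] *= inv_diag_E[k];
+//         for(i=0; i<k; i++)
+//             D[i][j] -= E[i][k] * D[k][j];
+//         }
+//     }
+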
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // E1 <- E0
+ addq %r11, %r15 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r15, %r14 // E2 <- E1
+ addq %r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ cmpl $11, %r13d
+ jle 0f
+
+ vmovapd 352(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 88(%r12), %ymm12
+ vmovapd 352(%r15), %ymm15
+// vmovapd 352(%r10), %ymm11
+
+ vpermpd $0xff, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 352(%r10), %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 352(%r10), %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 352(%r10), %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 352(%r10), %ymm14, %ymm3
+
+0:
+ cmpl $10, %r13d
+ jle 1f
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit load below already zeroes the upper lanes)
+ vmovapd 320(%r14), %xmm13
+ vbroadcastsd 80(%r12), %ymm12
+ vmovapd 320(%r15), %ymm15
+// vmovapd 320(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 320(%r10), %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 320(%r10), %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 320(%r10), %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 320(%r10), %ymm14, %ymm3
+
+1:
+ cmpl $9, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 288(%r14), %xmm13
+ vbroadcastsd 72(%r12), %ymm12
+ vmovapd 288(%r15), %ymm15
+// vmovapd 288(%r10), %ymm11
+
+ vpermpd $0x55, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 288(%r10), %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 288(%r10), %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 288(%r10), %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 288(%r10), %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 256(%r15), %ymm15
+// vmovapd 256(%r10), %ymm11
+
+ vpermpd $0x00, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 256(%r10), %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 256(%r10), %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 256(%r10), %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 256(%r10), %ymm14, %ymm3
+
+
+ // middle-middle
+
+ vmovapd 224(%r15), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm15
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit load below already zeroes the upper lanes)
+ vmovapd 192(%r15), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm15
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r15), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm15
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm15
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_12x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
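+// scalar reference of the factorization (a sketch under assumed naming: the
+// 12x4 panel D in ymm0-11 is LU-factorized in place without pivoting, U on
+// and above the diagonal, unit-diagonal L below it; inv_diag_E receives the
+// reciprocals of the four pivots):
+//
+// for(j=0; j<4; j++)
+//     {
+//     for(k=0; k<j; k++)
+//         for(i=k+1; i<12; i++)
+//             D[i][j] -= D[i][k] * D[k][j];
+//     inv_diag_E[j] = 1.0 / D[j][j];
+//     for(i=j+1; i<12; i++)
+//         D[i][j] *= inv_diag_E[j];
+//     }
+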
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_12x4_lib4, @function
+inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_12x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) || defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_12x4_lib4, .-inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// middle kernel
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
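+// scalar reference of the operation (a sketch under assumed naming: D is the
+// 12x4 panel in ymm0-11, E the previously factorized 12x4 block-column to its
+// left with unit-diagonal L entries; the lower 8x4 block of D, pivots on rows
+// 4..7, is then LU-factorized and inv_diag_D receives the pivot reciprocals):
+//
+// for(k=0; k<4; k++)                  // eliminate with the previous block-column
+//     for(j=0; j<4; j++)
+//         for(i=k+1; i<12; i++)
+//             D[i][j] -= E[i][k] * D[k][j];
+// for(j=0; j<4; j++)                  // factorize rows 4..11
+//     {
+//     for(k=0; k<j; k++)
+//         for(i=4+k+1; i<12; i++)
+//             D[i][j] -= D[i][k] * D[4+k][j];
+//     inv_diag_D[j] = 1.0 / D[4+j][j];
+//     for(i=4+j+1; i<12; i++)
+//         D[i][j] *= inv_diag_D[j];
+//     }
+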
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_m_12x4_lib4, @function
+inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_m_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_m_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r14, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // solve upper 4x4 & correct lower 8x4
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r14), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r14), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r14), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r14), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+
+ // factorize lower 8x4
+
+#if defined(OS_LINUX) || defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm4, %ymm12, %ymm12
+ vmovapd %ymm4, %ymm12
+ vdivsd %xmm4, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r12)
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm4, %ymm4
+
+ // second column
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm5, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm5, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r12)
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm5, %ymm5
+
+ // third column
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm6, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm6, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm6, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r12)
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm6, %ymm6
+
+ // fourth column
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm7, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm7, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm7, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm7, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r12)
+// vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_m_12x4_lib4, .-inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// right kernel
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_r_12x4_lib4, @function
+inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_r_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_r_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r14, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // solve upper 8x4 & correct lower 4x4
+
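+	// forward substitution with the unit lower triangular block at the top of
+	// E: for each of its columns the lanes at and above the diagonal are zeroed
+	// by the blend, so only the strictly lower part is subtracted from rows
+	// 0..3, while the full columns of the E panels at r14 and r13 update rows
+	// 4..7 and 8..11 with the same multipliers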
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r14), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r14), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r14), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r14), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ addq $128, %r14
+ addq $128, %r13
+
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r14), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r13), %ymm14
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r14), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm14
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r14), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r13), %ymm14
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm4, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xff, %ymm5, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xff, %ymm6, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xff, %ymm7, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+
+
+ // factorize lower 8x4
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm8, %ymm12, %ymm12
+ vmovapd %ymm8, %ymm12
+ vdivsd %xmm8, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r12)
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm8, %ymm8
+
+ // second column
+ vpermpd $0x00, %ymm9, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm9, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm9, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r12)
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm9, %ymm9
+
+ // third column
+ vpermpd $0x00, %ymm10, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm10, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm10, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm10, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm10, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r12)
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm10, %ymm10
+
+ // fourth column
+ vpermpd $0x00, %ymm11, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm11, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm11, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm11, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm11, %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm11, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm11, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r12)
+// vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm11, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_r_12x4_lib4, .-inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_lib4, @function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 0(%r10, %r11, 2)
+ vmovapd %ymm9, 32(%r10, %r11, 2)
+ vmovapd %ymm10, 64(%r10, %r11, 2)
+ vmovapd %ymm11, 96(%r10, %r11, 2)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x12_lib4, @function
+inner_store_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x12_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm11, 352(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x12_lib4, .-inner_store_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_vs_lib4, @function
+inner_store_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC04(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
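+	// build a per-row store mask from km: ymm15 = LC04 - km, and vmaskmovpd
+	// below writes a lane of the bottom 4-row panel only where the sign bit of
+	// that lane is set (LC04 is assumed to hold 8.5, 9.5, 10.5, 11.5 here, so
+	// only rows with index < km are stored)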
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+ vmaskmovpd %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_vs_lib4, .-inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X12_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x12_vs_lib4, @function
+inner_store_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
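+	// same masking idea as in the 12x4 vs store above, but here r11d carries km
+	// (the number of rows of the 4-row block written via the mask in ymm15,
+	// with LC02 assumed to hold 0.5, 1.5, 2.5, 3.5) and r12d carries kn, which
+	// the cmpl/jl chain below uses to skip the trailing columns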
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+ vmaskmovpd %ymm8, %ymm15, 256(%r10)
+ cmpl $10, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm9, %ymm15, 288(%r10)
+ cmpl $11, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm10, %ymm15, 320(%r10)
+ je 0f // end
+ vmaskmovpd %ymm11, %ymm15, 352(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x12_vs_lib4, .-inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib4, @function
+inner_store_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib4:
+#endif
+#endif
+
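+	// read-modify-write on the top 4x4 block: the existing strictly upper part
+	// of D is blended back in, so only the lower triangle of this block is
+	// overwritten; the two panels below it are stored in full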
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 0(%r10, %r11, 2)
+ vmovapd %ymm9, 32(%r10, %r11, 2)
+ vmovapd %ymm10, 64(%r10, %r11, 2)
+ vmovapd %ymm11, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib4, .-inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib4, @function
+inner_store_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ movq %r15, %r14 // D2 <- D1
+ addq %r11, %r14 // D2 <- D1 + 4*sdd*sizeof(double)
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC04(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm4, 0(%r15)
+ vmaskmovpd %ymm8, %ymm15, 0(%r14)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm5, 32(%r15)
+ vmaskmovpd %ymm9, %ymm15, 32(%r14)
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm6, 64(%r15)
+ vmaskmovpd %ymm10, %ymm15, 64(%r14)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm7, 96(%r15)
+ vmaskmovpd %ymm11, %ymm15, 96(%r14)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib4, .-inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
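+//
+// computes D = beta*C + alpha * A * B^T on a 12x4 block; A, C and D are
+// stored in 4-row panels (lib4 layout) with panel strides sda, sdc, sdd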
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_12x4_lib4
+ .type kernel_dgemm_nt_12x4_lib4, @function
+kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_12x4_lib4
+_kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_12x4_lib4
+ .def kernel_dgemm_nt_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_12x4_lib4, .-kernel_dgemm_nt_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x12_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
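+//
+// the 4x12 case reuses the 12x4 inner kernel with the roles of A and B
+// swapped (B and sdb are loaded into the "A" registers below), and the
+// result is transposed while scaling via inner_tran_scale_ab_4x12_lib4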
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x12_lib4
+ .type kernel_dgemm_nt_4x12_lib4, @function
+kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x12_lib4
+_kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x12_lib4
+ .def kernel_dgemm_nt_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x12_lib4, .-kernel_dgemm_nt_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
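+//
+// vs ("variable size") variant: km and kn give how many of the 12 rows and 4
+// columns are actually written back by the masked store; the accumulation
+// itself is identical to the fixed-size kernel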
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_12x4_vs_lib4
+ .type kernel_dgemm_nt_12x4_vs_lib4, @function
+kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_12x4_vs_lib4
+_kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_12x4_vs_lib4
+ .def kernel_dgemm_nt_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_12x4_vs_lib4, .-kernel_dgemm_nt_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x12_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x12_vs_lib4
+ .type kernel_dgemm_nt_4x12_vs_lib4, @function
+kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x12_vs_lib4
+_kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x12_vs_lib4
+ .def kernel_dgemm_nt_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x12_vs_lib4, .-kernel_dgemm_nt_4x12_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
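+//
+// nn variant: B is not transposed, so the edge routine first handles the
+// initial, unaligned part of B indicated by offsetB before the main nn inner
+// kernel runs on full panels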
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_12x4_lib4
+ .type kernel_dgemm_nn_12x4_lib4, @function
+kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_12x4_lib4
+_kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_12x4_lib4
+ .def kernel_dgemm_nn_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_12x4_lib4, .-kernel_dgemm_nn_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x12_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x12_lib4
+ .type kernel_dgemm_nn_4x12_lib4, @function
+kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x12_lib4
+_kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x12_lib4
+ .def kernel_dgemm_nn_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x12_lib4, .-kernel_dgemm_nn_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dgemm_nn_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_12x4_vs_lib4
+ .type kernel_dgemm_nn_12x4_vs_lib4, @function
+kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_12x4_vs_lib4
+_kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_12x4_vs_lib4
+ .def kernel_dgemm_nn_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_12x4_vs_lib4, .-kernel_dgemm_nn_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
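+//
+// same accumulation as the dgemm nt kernel above; the difference is the
+// store, which keeps only the lower triangle of the top 4x4 block
+// (inner_store_l_12x4_lib4), as needed for a lower symmetric rank-k update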
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_12x4_lib4
+ .type kernel_dsyrk_nt_l_12x4_lib4, @function
+kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_12x4_lib4
+_kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_12x4_lib4
+ .def kernel_dsyrk_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_12x4_lib4, .-kernel_dsyrk_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_12x4_vs_lib4
+ .type kernel_dsyrk_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_12x4_vs_lib4
+_kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_12x4_vs_lib4
+ .def kernel_dsyrk_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_12x4_vs_lib4, .-kernel_dsyrk_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
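+//
+// D = alpha * A * B with B lower triangular on the right ("rl"): the edge
+// routine consumes the initial triangle of B, the remaining panels go through
+// the regular nn inner kernel, and there is no beta/C term (inner_scale_a0)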
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_12x4_lib4
+ .type kernel_dtrmm_nn_rl_12x4_lib4, @function
+kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_12x4_lib4
+_kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_12x4_lib4
+ .def kernel_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_12x4_lib4, .-kernel_dtrmm_nn_rl_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrmm_nn_rl_12x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+ .type kernel_dtrmm_nn_rl_12x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_12x4_vs_lib4
+_kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+ .def kernel_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_12x4_vs_lib4, .-kernel_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
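+//
+// nt_ru triangular multiply: the part of A*B^T beyond the leading 4x4
+// triangle of B is accumulated first (hence the k-4 trip count and the +128
+// byte offsets into A and B), then the edge routine applies the triangle and
+// the result is scaled by alpha and added to beta*C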
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_12x4_lib4
+ .type kernel_dtrmm_nt_ru_12x4_lib4, @function
+kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_12x4_lib4
+_kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_12x4_lib4
+ .def kernel_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_12x4_lib4, .-kernel_dtrmm_nt_ru_12x4_lib4
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_12x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_12x4_vs_lib4
+_kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_12x4_vs_lib4, .-kernel_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
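+//
+// Cholesky factorization step: the sub kernel accumulates -A*B^T, the
+// unscaled C is added (inner_scale_11), then the dpotrf edge routine
+// factorizes the 12x4 panel in place and writes the reciprocals of the
+// diagonal to inv_diag_D; only the lower part of the top block is stored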
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_12x4_lib4
+ .type kernel_dpotrf_nt_l_12x4_lib4, @function
+kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_12x4_lib4
+_kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_12x4_lib4
+ .def kernel_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_12x4_lib4, .-kernel_dpotrf_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_12x4_vs_lib4
+ .type kernel_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_12x4_vs_lib4
+ .def kernel_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dpotrf_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
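+//
+// fused syrk + potrf: Ap*Bp^T is added over kp iterations, Am*Bm^T is
+// subtracted over km iterations, C is added unscaled, and the result is then
+// factorized exactly as in kernel_dpotrf_nt_l_12x4_lib4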
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_12x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // kn=4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_12x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // B
+ movq ARG4, %r12 // sdb
+ sall $5, %r12d // 32*sdb
+ movq ARG2, %r13 // A
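+	// the 4x12 kernel reuses the shared 12x4 inner kernel with swapped operands
+	// (B/sdb in the 12-row slots, A in the 4-row slot); the accumulated block is
+	// transposed afterwards by inner_tran_scale_11_4x12_lib4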
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG3, %r11 // Bp
+ movq ARG4, %r12 // sdbp
+ sall $5, %r12d // 32*sdbp
+ movq ARG2, %r13 // Ap
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG7, %r11 // Bm
+ movq ARG8, %r12 // sdbm
+ sall $5, %r12d // 32*sdbm
+ movq ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG9, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG11, %r10 // E
+ movq ARG12, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG13, %r12 // inv_diag_E
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG14, %r11 // km
+ movq ARG15, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dtrsm_nt_rl_inv_4x12_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x12_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x12_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // B
+	movq	ARG4, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x12_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_12x4_lib4
+ .type kernel_dtrsm_nt_rl_one_12x4_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_12x4_lib4
+_kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_12x4_lib4
+ .def kernel_dtrsm_nt_rl_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_12x4_lib4, .-kernel_dtrsm_nt_rl_one_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_12x4_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_12x4_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_12x4_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_12x4_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_12x4_lib4
+ .type kernel_dtrsm_nn_ll_one_12x4_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_12x4_lib4
+_kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_12x4_lib4
+ .def kernel_dtrsm_nn_ll_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_12x4_lib4, .-kernel_dtrsm_nn_ll_one_12x4_lib4
+#endif
+
+
+
+
+
+// edi    rsi    rdx    ecx    r8     r9     rsp+8  rsp+16  rsp+24  rsp+32  rsp+40  rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_12x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_12x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_12x4_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_12x4_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_12x4_lib4
+ .type kernel_dgetrf_nn_l_12x4_lib4, @function
+kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_12x4_lib4
+_kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_12x4_lib4
+ .def kernel_dgetrf_nn_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_12x4_lib4, .-kernel_dgetrf_nn_l_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_12x4_vs_lib4
+ .type kernel_dgetrf_nn_l_12x4_vs_lib4, @function
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_12x4_vs_lib4
+_kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_12x4_vs_lib4
+ .def kernel_dgetrf_nn_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_12x4_vs_lib4, .-kernel_dgetrf_nn_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_m_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_m_12x4_lib4
+ .type kernel_dgetrf_nn_m_12x4_lib4, @function
+kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_m_12x4_lib4
+_kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_m_12x4_lib4
+ .def kernel_dgetrf_nn_m_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $128, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_m_12x4_lib4, .-kernel_dgetrf_nn_m_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_m_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_m_12x4_vs_lib4
+ .type kernel_dgetrf_nn_m_12x4_vs_lib4, @function
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_m_12x4_vs_lib4
+_kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_m_12x4_vs_lib4
+ .def kernel_dgetrf_nn_m_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $128, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_m_12x4_vs_lib4, .-kernel_dgetrf_nn_m_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_r_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
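+//
+// note: as the _m kernel above, but with E = D - 256 bytes (two 4x4 sub-panels, i.e. 8 columns
+// earlier) passed to inner_edge_dgetrf_r_12x4_lib4.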
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_r_12x4_lib4
+ .type kernel_dgetrf_nn_r_12x4_lib4, @function
+kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_r_12x4_lib4
+_kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_r_12x4_lib4
+ .def kernel_dgetrf_nn_r_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $256, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_r_12x4_lib4, .-kernel_dgetrf_nn_r_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_r_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_r_12x4_vs_lib4
+ .type kernel_dgetrf_nn_r_12x4_vs_lib4, @function
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_r_12x4_vs_lib4
+_kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_r_12x4_vs_lib4
+ .def kernel_dgetrf_nn_r_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $256, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_r_12x4_vs_lib4, .-kernel_dgetrf_nn_r_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dlarfb12_r_4_lib4(int kmax, double *pV, int sdd, double *pT, double *pD, double *pK, int km);
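+//
+// note: following LAPACK's dlarfb naming, this kernel presumably applies a block of 12
+// Householder reflectors (pV, with triangular factor pT) from the right to 4 rows of pD;
+// pK receives a 4x12 intermediate block that is reused in the trailing dgebp update, and
+// km handles the case of fewer than 4 rows.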
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb12_r_4_lib4
+ .type kernel_dlarfb12_r_4_lib4, @function
+kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb12_r_4_lib4
+_kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb12_r_4_lib4
+ .def kernel_dlarfb12_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb12_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+// vmovapd %ymm0, %ymm8
+// vmovapd %ymm0, %ymm9
+// vmovapd %ymm0, %ymm10
+// vmovapd %ymm0, %ymm11
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // D
+ movq ARG2, %r12 // V
+ movq ARG3, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vmovapd %ymm12, %ymm0
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vmovapd %ymm12, %ymm1
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vmovapd %ymm12, %ymm2
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vmovapd %ymm12, %ymm3
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+ //
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vmovapd %ymm12, %ymm4
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vmovapd %ymm12, %ymm5
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vmovapd %ymm12, %ymm6
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vmovapd %ymm12, %ymm7
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+ //
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vmovapd %ymm12, %ymm8
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vmovapd %ymm12, %ymm9
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vmovapd %ymm12, %ymm10
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vmovapd %ymm12, %ymm11
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ movq %r11, %r14
+ movq %r12, %r11
+ movq %r13, %r12
+ movq %r14, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_TRAN_12X4_LIB4
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+ INNER_TRAN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_12x4_lib4
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+ call inner_tran_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_12x4_lib4
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+ callq _inner_tran_12x4_lib4
+#endif
+#endif
+
+ movq ARG4, %r11 // T
+	movq	$384, %r12 // sdt: hard-coded to 12*4*sizeof(double) !!!!!!!!!!!!!!!!!!!!!!!!!
+
+ //
+ vbroadcastsd 376(%r11, %r12, 2), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+ //
+ vbroadcastsd 368(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm10, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 2), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ //
+ vbroadcastsd 360(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 2), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ //
+ vbroadcastsd 352(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 2), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ //
+ vbroadcastsd 376(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm11
+ vbroadcastsd 344(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 312(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 280(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 248(%r11, %r12, 1), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ //
+ vbroadcastsd 368(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 304(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 272(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 240(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 208(%r11, %r12, 1), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ //
+ vbroadcastsd 360(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 264(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 232(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 200(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 168(%r11, %r12, 1), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ //
+ vbroadcastsd 352(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 224(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 192(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 160(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 128(%r11, %r12, 1), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ //
+ vbroadcastsd 376(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm11
+ vbroadcastsd 344(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 312(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 280(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 248(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 216(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 184(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 152(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 120(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ //
+ vbroadcastsd 368(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm11
+ vbroadcastsd 336(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 304(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 272(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 240(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 208(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 176(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 144(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ //
+ vbroadcastsd 360(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm11
+ vbroadcastsd 328(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 296(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 264(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 232(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 200(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 168(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 136(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ //
+ vbroadcastsd 352(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm11
+ vbroadcastsd 320(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 288(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 256(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 224(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 192(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 160(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 128(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+
+ movq ARG6, %r10 // K
+ movq ARG7, %r11 // km
+
+ cmpl $4, %r11d
+ jge 0f
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendvpd %ymm15, %ymm11, %ymm14, %ymm11
+ vblendvpd %ymm15, %ymm10, %ymm14, %ymm10
+ vblendvpd %ymm15, %ymm9, %ymm14, %ymm9
+ vblendvpd %ymm15, %ymm8, %ymm14, %ymm8
+ vblendvpd %ymm15, %ymm7, %ymm14, %ymm7
+ vblendvpd %ymm15, %ymm6, %ymm14, %ymm6
+ vblendvpd %ymm15, %ymm5, %ymm14, %ymm5
+ vblendvpd %ymm15, %ymm4, %ymm14, %ymm4
+ vblendvpd %ymm15, %ymm3, %ymm14, %ymm3
+ vblendvpd %ymm15, %ymm2, %ymm14, %ymm2
+ vblendvpd %ymm15, %ymm1, %ymm14, %ymm1
+ vblendvpd %ymm15, %ymm0, %ymm14, %ymm0
+
+0:
+ vmovapd %ymm11, 352(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm7, 224(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm0, 0(%r10)
+
+	movq	ARG1, %r10 // kmax
+ movq ARG6, %r11 // K
+ movq ARG2, %r12 // V
+ movq ARG3, %r13 // sdd
+ sall $5, %r13d
+ movq ARG5, %r14 // D
+
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vaddpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vaddpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vaddpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vaddpd %ymm12, %ymm4, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vaddpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vaddpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vaddpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vaddpd %ymm12, %ymm8, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vaddpd %ymm12, %ymm9, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vaddpd %ymm12, %ymm10, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vaddpd %ymm12, %ymm11, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ subl $12, %r10d
+ addq $384, %r12
+ addq $384, %r14
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+100:
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb12_r_4_lib4, .-kernel_dlarfb12_r_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_12_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
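+//
+// note: counterpart of the kernel above: presumably applies a block of 4 reflectors (pV, pT)
+// from the right to 12 rows of pD, stored as three 4-row panels with stride sdd.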
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_12_lib4
+ .type kernel_dlarfb4_r_12_lib4, @function
+kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_12_lib4
+_kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_12_lib4
+ .def kernel_dlarfb4_r_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+// vmovapd %ymm0, %ymm8
+// vmovapd %ymm0, %ymm9
+// vmovapd %ymm0, %ymm10
+// vmovapd %ymm0, %ymm11
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 0(%r11, %r12, 2), %ymm8
+ //
+ vmovapd 32(%r11), %ymm1
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 32(%r11, %r12, 2), %ymm9
+ vbroadcastsd 32(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ vfmadd231pd %ymm13, %ymm5, %ymm4
+ vfmadd231pd %ymm13, %ymm9, %ymm8
+ //
+ vmovapd 64(%r11), %ymm2
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 64(%r11, %r12, 2), %ymm10
+ vbroadcastsd 64(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vfmadd231pd %ymm13, %ymm6, %ymm4
+ vfmadd231pd %ymm13, %ymm10, %ymm8
+ vbroadcastsd 72(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ vfmadd231pd %ymm13, %ymm6, %ymm5
+ vfmadd231pd %ymm13, %ymm10, %ymm9
+ //
+ vmovapd 96(%r11), %ymm3
+ vmovapd 96(%r11, %r12, 1), %ymm7
+ vmovapd 96(%r11, %r12, 2), %ymm11
+ vbroadcastsd 96(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vfmadd231pd %ymm13, %ymm7, %ymm4
+ vfmadd231pd %ymm13, %ymm11, %ymm8
+ vbroadcastsd 104(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vfmadd231pd %ymm13, %ymm7, %ymm5
+ vfmadd231pd %ymm13, %ymm11, %ymm9
+ vbroadcastsd 112(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+ vfmadd231pd %ymm13, %ymm7, %ymm6
+ vfmadd231pd %ymm13, %ymm11, %ymm10
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vfmadd231pd %ymm6, %ymm12, %ymm7
+ vfmadd231pd %ymm10, %ymm12, %ymm11
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vfmadd231pd %ymm5, %ymm12, %ymm7
+ vfmadd231pd %ymm9, %ymm12, %ymm11
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vfmadd231pd %ymm5, %ymm12, %ymm6
+ vfmadd231pd %ymm9, %ymm12, %ymm10
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vfmadd231pd %ymm4, %ymm12, %ymm7
+ vfmadd231pd %ymm8, %ymm12, %ymm11
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vfmadd231pd %ymm4, %ymm12, %ymm6
+ vfmadd231pd %ymm8, %ymm12, %ymm10
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vfmadd231pd %ymm4, %ymm12, %ymm5
+ vfmadd231pd %ymm8, %ymm12, %ymm9
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vaddpd %ymm15, %ymm8, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vaddpd %ymm15, %ymm9, %ymm15
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ vmovapd %ymm15, 32(%r12, %r13, 2)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vaddpd %ymm15, %ymm10, %ymm15
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ vmovapd %ymm15, 64(%r12, %r13, 2)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vaddpd %ymm15, %ymm11, %ymm15
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+ vmovapd %ymm15, 96(%r12, %r13, 2)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_12_lib4, .-kernel_dlarfb4_r_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+LC04: // { 11.5 10.5 9.5 8.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1075904512
+ .long 0
+ .long 1076035584
+ .long 0
+ .long 1076166656
+ .long 0
+ .long 1076297728
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC05: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_4x4_lib4.S b/kernel/avx2/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..c9bf696
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9433 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
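+
+// note: the Windows prologue/epilogue additionally save and restore rdi, rsi and xmm6-xmm15,
+// which are callee-saved in the Microsoft x64 ABI; arguments beyond the register ones are read
+// from the caller's stack (ARG7 and above on Linux/Mac, ARG5 and above on Windows).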
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
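+//
+// note: ymm4..ymm7 (not listed above) are used as a second accumulator set inside this routine
+// and are summed into ymm0..ymm3 in the reduce step before returning.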
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
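+
+// compile-time selection: the enabled branch broadcasts one B entry at a time and FMAs it
+// against a full packed column of A; the disabled #else branch keeps B packed and rotates it
+// with vshufpd/vperm2f128 instead (shuffle scheme).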
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $32, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+	vmovapd	%ymm4, %ymm5
+	vmovapd	%ymm4, %ymm6
+	vmovapd	%ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm3
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+	// unroll 3
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $32, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
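+// rough C equivalent of the loop below (B is panel-major with panel stride
+// sdb, so element (l,j) of B sits at B[l%4 + (l/4)*4*sdb + 4*j]):
+//   for(l=0; l<k; l++)
+//     for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//         D[i+4*j] += A[i+4*l] * B[l%4 + (l/4)*4*sdb + 4*j];   // D += A * B
+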
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
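+// rough C equivalent of the rank-4 update below (the four columns of A are
+// already held in ymm0..ymm3; B and C are 4-wide panel-major):
+//   for(j=0; j<k; j++)
+//     for(i=0; i<4; i++)
+//       C[i+4*j] += A[i+4*0]*B[0+4*j] + A[i+4*1]*B[1+4*j]
+//                 + A[i+4*2]*B[2+4*j] + A[i+4*3]*B[3+4*j];
+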
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 32(%r12)
+
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 64(%r12)
+
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
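+// rough C equivalent of the edge below: it consumes kend = min(k, 4-offB)
+// iterations one at a time so that the main nn kernel afterwards sees B
+// aligned to the start of a 4-row panel:
+//   kend = k < 4-offB ? k : 4-offB;
+//   for(l=0; l<kend; l++)
+//     for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//         D[i+4*j] += A[i+4*l] * B[offB+l + 4*j];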
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
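+// rough C equivalent of the edge below: the first four columns of A are
+// multiplied by the transposed 4x4 upper-triangular corner of B:
+//   for(j=0; j<4; j++)
+//     for(l=j; l<4; l++)       // only B(j,l) with l>=j (upper triangle) is read
+//       for(i=0; i<4; i++)
+//         D[i+4*j] += A[i+4*l] * B[j+4*l];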
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
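+// rough C equivalent of the offB==0 path below: the first four columns of A
+// are multiplied by the 4x4 lower-triangular corner of B:
+//   for(j=0; j<4; j++)
+//     for(l=j; l<4; l++)       // only B(l,j) with l>=j (lower triangle) is read
+//       for(i=0; i<4; i++)
+//         D[i+4*j] += A[i+4*l] * B[l+4*j];
+// the offB==1,2,3 paths do the same starting offB rows into B's current panel.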
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r14d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r14d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r14d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
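+// rough C equivalent of the edge below: the 4x4 corner of A is masked to its
+// upper triangle (vblendpd against a zero register) before the nt-style update:
+//   for(j=0; j<4; j++)
+//     for(l=0; l<4; l++)
+//       for(i=0; i<=l; i++)    // only A(i,l) with i<=l contributes
+//         D[i+4*j] += A[i+4*l] * B[j+4*l];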
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r11
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
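+// note: no floating-point arithmetic is performed here; the two rounds of
+// vblendpd below only rearrange the rotated accumulator layout described above
+// back into plain columns of D (ymm0..ymm3 <- D(:,0)..D(:,3)).
+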
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
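+// rough C equivalent (D denotes the accumulator block held in ymm0..ymm3, C is
+// 4-wide panel-major):
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       D[i+4*j] = alpha*D[i+4*j] + beta*C[i+4*j];
+// the beta==0.0 test below skips both the loads of C and the beta term.
+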
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
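+// note on the offset handling below: for offset 1..3 each logical column of C
+// spans two row panels, C0 = C and C1 = C + 4*sdc; the vblendpd/vperm2f128/
+// vshufpd sequences gather the four consecutive rows (wrapping from C0 into C1)
+// into one register, after which the update is the same as in
+// inner_scale_ab_4x4_lib4: D = alpha*D + beta*C.
+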
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
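+// note: this is the rotated-accumulator variant of inner_scale_ab_4x4_lib4
+// above: the vblendpd sequence first restores plain columns, then
+// D = alpha*D (+ beta*C when beta != 0.0).
+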
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
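+//
+// reading aid (comment only, not assembled): assuming BLASFEO's 4x4
+// panel-major indexing C[i+4*j] and the accumulator layout above (lane i of
+// accumulator k holds element (i, i XOR k) of the 4x4 block), this routine
+// is equivalent to the plain-C sketch
+//
+//   for (int i = 0; i < 4; i++)
+//       for (int j = 0; j < 4; j++)
+//           d[i][j] = acc[i ^ j][i] + C[i + 4*j];   // d(i,j) stays in ymm"j", lane i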
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
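+//
+// reading aid (comment only, not assembled): an approximate plain-C model,
+// assuming panel-major indexing D[i+4*j] for the 4x4 block held in ymm0..ymm3;
+// kn bounds the factorized columns and a non-positive pivot zeroes its column:
+//
+//   for (int j = 0; j < kn && j < 4; j++) {
+//       double djj = D[j + 4*j];
+//       double idj = (djj > 0.0) ? 1.0 / sqrt(djj) : 0.0;
+//       inv_diag_E[j] = idj;
+//       for (int i = 0; i < 4; i++) D[i + 4*j] *= idj;  // scale column j
+//       if (j + 1 >= kn) break;                          // nothing left to factorize
+//       for (int k = j + 1; k < 4; k++)                  // update trailing columns
+//           for (int i = 0; i < 4; i++)
+//               D[i + 4*k] -= D[i + 4*j] * D[k + 4*j];
+//   }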
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
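+//
+// reading aid (comment only, not assembled): with the 4x4 block held in
+// ymm0..ymm3 and panel-major indexing, this computes D <- D * E^{-T} for
+// lower-triangular E, using the precomputed reciprocals of the diagonal:
+//
+//   for (int j = 0; j < 4; j++) {
+//       for (int i = 0; i < 4; i++)
+//           D[i + 4*j] *= inv_diag_E[j];                 // divide column j by E(j,j)
+//       for (int k = j + 1; k < 4; k++)                  // eliminate from later columns
+//           for (int i = 0; i < 4; i++)
+//               D[i + 4*k] -= D[i + 4*j] * E[k + 4*j];
+//   }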
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ cmpl $2, %r12d
+ vmulpd %ymm0, %ymm13, %ymm0
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r12d
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r12d
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
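+//
+// reading aid (comment only, not assembled): same convention as above, but E
+// is upper triangular, so the columns are solved in reverse order, roughly
+//
+//   for (int j = 3; j >= 0; j--) {
+//       for (int i = 0; i < 4; i++)
+//           D[i + 4*j] *= inv_diag_E[j];                 // divide column j by E(j,j)
+//       for (int k = 0; k < j; k++)                      // eliminate from earlier columns
+//           for (int i = 0; i < 4; i++)
+//               D[i + 4*k] -= D[i + 4*j] * E[k + 4*j];
+//   }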
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
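+//
+// reading aid (comment only, not assembled): forward substitution applied to
+// all four right-hand-side columns at once, with E unit lower triangular
+// (only its strictly-lower part is read); in plain C, roughly
+//
+//   for (int k = 0; k < 3; k++)                  // columns of E
+//       for (int j = 0; j < 4; j++)              // right-hand-side columns in ymm0..3
+//           for (int i = k + 1; i < 4; i++)
+//               D[i + 4*j] -= E[i + 4*k] * D[k + 4*j];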
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
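+//
+// reading aid (comment only, not assembled): backward substitution with E
+// upper triangular and precomputed diagonal reciprocals; in plain C, roughly
+//
+//   for (int j = 0; j < 4; j++)                  // right-hand-side columns in ymm0..3
+//       for (int k = 3; k >= 0; k--) {
+//           double x = D[k + 4*j] * inv_diag_E[k];
+//           D[k + 4*j] = x;
+//           for (int i = 0; i < k; i++)
+//               D[i + 4*j] -= E[i + 4*k] * x;
+//       }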
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
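+//
+// reading aid (comment only, not assembled): right-looking LU without pivoting
+// of the 4x4 block, unit L stored below the diagonal, U on and above it, and
+// the reciprocals of the U diagonal written to inv_diag_E; roughly
+//
+//   for (int j = 0; j < 4; j++) {
+//       for (int k = 0; k < j; k++)              // eliminate with finished columns
+//           for (int i = k + 1; i < 4; i++)
+//               D[i + 4*j] -= D[i + 4*k] * D[k + 4*j];
+//       inv_diag_E[j] = 1.0 / D[j + 4*j];
+//       for (int i = j + 1; i < 4; i++)          // scale the L part of column j
+//           D[i + 4*j] *= inv_diag_E[j];
+//   }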
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
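+//
+// reading aid (comment only, not assembled): the mask built below selects the
+// first km rows (assuming .LC02, defined elsewhere in this file, holds
+// {0.5, 1.5, 2.5, 3.5}), and kn limits the stored columns; roughly
+//
+//   for (int j = 0; j < kn; j++)
+//       for (int i = 0; i < km; i++)
+//           D[i + 4*j] = acc[j][i];              // acc[j] = ymm"j"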
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
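+//
+// reading aid (comment only, not assembled): conceptually this clips the 4x4
+// block to rows [m0,m1) and columns [n0,min(n1,4)) and writes it at a row
+// offset inside the panel; a rough plain-C model, assuming D points at the
+// block's first panel and sdd is the panel stride in columns, is
+//
+//   for (int j = n0; j < n1 && j < 4; j++)
+//       for (int i = m0; i < m1 && i < 4; i++) {
+//           int r = offset + i;                  // row counted inside the panel pair
+//           double *Dj = D + 4*j + (r/4)*4*sdd;  // panel holding that row
+//           Dj[r % 4] = acc[j][i];               // acc[j] = ymm"j"
+//       }
+//
+// the masks loaded from .LC05..LC10 (assumed to be defined elsewhere in this
+// file) select which lanes go to the current panel and which to the next one.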
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vblendpd $0x4, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x2, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
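+//
+// reading aid (comment only, not assembled): in plain C, with A and B stored
+// as 4 x k panels (4 doubles per column, consecutive columns 32 bytes apart)
+// and C, D as 4x4 panel-major blocks, this kernel computes
+//
+//   for (int i = 0; i < 4; i++)
+//       for (int j = 0; j < 4; j++) {
+//           double d = 0.0;
+//           for (int l = 0; l < k; l++)
+//               d += A[i + 4*l] * B[j + 4*l];
+//           D[i + 4*j] = alpha[0]*d + beta[0]*C[i + 4*j];
+//       }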
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
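+// "_gen_" variants take explicit offsets (offB/offC/offD) and index bounds
+// (m0/m1, n0/n1); judging by the inner_store_4x4_gen_lib4 call they end with,
+// these bounds appear to select which rows and columns of the 4x4 result are
+// actually written, letting the kernel handle matrix edges and output that is
+// not aligned to a panel boundary.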
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
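+// The "_l_" suffix marks the lower-triangular variant: the accumulation is a
+// full 4x4 dgemm nt, but only the lower triangle of the result is written
+// back (inner_store_l_4x4_lib4), as expected for a symmetric rank-k update.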
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
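+// "_vs_" (variable size) variants add the km/kn arguments, which are passed
+// on to the _vs_ store routine and appear to bound how many rows and columns
+// of the 4x4 block are written, so partial blocks at the matrix border can be
+// handled by the same kernel.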
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
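+	// (the leading 4x4 triangle of B is handled by the edge routine below, so
+	// the main loop covers only the remaining k-4 iterations, with A and B
+	// each advanced past the first panel: 4*4*sizeof(double) = 128 bytes)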
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d
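+	// (the fixed-size kernel reuses the variable-size factorization edge and
+	// simply passes kn=4, so the full 4x4 block is factorized)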
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
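+// Fused syrk+potrf kernel: it first accumulates Ap*Bp^T (the add call below),
+// then subtracts Am*Bm^T (the sub call), adds C, and factorizes the resulting
+// 4x4 block; inv_diag_D presumably receives the reciprocals of the factor's
+// diagonal, as its name suggests.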
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
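+// Triangular solve kernel: the updated 4x4 block C - A*B^T is solved against
+// the lower-triangular factor E (the "rl" in the name indicates a solve from
+// the right with a lower factor); the "inv" suffix and the inv_diag_E
+// argument suggest that reciprocals of E's diagonal are precomputed, so the
+// solve can use multiplications instead of divisions.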
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
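+// The "_one_" variant appears to solve against a unit-diagonal factor: there
+// is no inv_diag_E argument, and the edge routine called below is the _one_
+// version of the solve, so no diagonal scaling is performed.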
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
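+// The nn solve kernels additionally take sdb, presumably because the
+// non-transposed traversal of B crosses panel boundaries, so the inner nn
+// kernel needs B's panel stride (converted to a byte offset below).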
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+ movq ARG9, %r12 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
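+
+// Explanatory sketch (comment only, not from the original source): a rough
+// C-level reading of this kernel, assuming the panel-major lib4 storage:
+//
+//   // tmp = C - A*B;              // 4x4 block; A is 4 x k, B is k x 4
+//   // factor tmp = L*U in place;  // L unit lower, U upper, no pivoting
+//   //                             // inside the 4x4 block
+//   // D = packed L\U factor;  inv_diag_D = 1.0 / diag(U)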
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
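+
+// Explanatory sketch (comment only, not from the original source): one
+// plausible reading of this kernel, which applies a block of 4 Householder
+// reflectors in compact-WY form to a 4-row block pD from the right:
+//
+//   // W = D * V^T;       // V (pV) is 4 x kmax, row-stored, with a unit
+//   //                    // upper triangular leading 4x4 block
+//   // W = W * T;         // T (pT) is 4x4 upper triangular
+//   // D = D + W * V;     // i.e. D := D * (I + V^T * T * V)
+//
+// The exact sign/transpose convention is whatever the corresponding
+// factorization kernel encodes in pT.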
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_4_lib4
+ .type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_4_lib4
+ .def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+ //
+ vmovapd 0(%r11), %ymm0
+ //
+ vmovapd 32(%r11), %ymm1
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ //
+ vmovapd 64(%r11), %ymm2
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ //
+ vmovapd 96(%r11), %ymm3
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm12
+ vmovapd %ymm12, 0(%r12)
+ //
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm1, %ymm12
+ vmovapd %ymm12, 32(%r12)
+ //
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm2, %ymm12
+ vmovapd %ymm12, 64(%r12)
+ //
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm3, %ymm12
+ vmovapd %ymm12, 96(%r12)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
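+
+// Explanatory comment (not part of the original source): each 32-byte
+// constant below packs four values, lowest lane first; the .quad entries
+// are integer lane masks, while the .long pairs are the little-endian
+// low/high IEEE-754 words of each double (e.g. 0x00000000/0x3FE00000,
+// i.e. 0/1071644672, is the first lane 0.5 of .LC02).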
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x4_lib4.S b/kernel/avx2/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..82a5a86
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,12995 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
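+
+// Explanatory comment (not part of the original source): the ARGn macros
+// above encode where each C argument lives once PROLOGUE has moved %rsp
+// down by STACKSIZE. On SysV (Linux/Mac) the first six arguments are in
+// registers and the first stack argument, originally at 8(%rsp) just above
+// the return address, becomes STACKSIZE+8(%rsp); on Win64 the first four
+// are in registers and the fifth, originally at 40(%rsp) (return address
+// plus the 32-byte shadow space), becomes STACKSIZE+40(%rsp).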
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
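+
+// Explanatory comment (not part of the original source): in the broadcast
+// scheme each element of B is broadcast to all four lanes and FMA'd against
+// a full column of A0/A1, so every accumulator holds a plain column of the
+// result block; the shuffle scheme in the #else branch below instead loads
+// whole columns of B and rotates them with vshufpd/vperm2f128, which is
+// what produces the permuted accumulator layout documented above.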
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+// cmpl $3, %r10d
+
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm3
+ vfmadd231pd %ymm9, %ymm14, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ subl $1, %r10d
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k
+// r11 <- A+4*sda*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
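+
+// Explanatory sketch (comment only, not from the original source): the same
+// 8x4 accumulation as the nt routine above, but with B not transposed, so B
+// is traversed along a panel row (lib4 layout assumed):
+//
+//   // for (jj = 0; jj < k; jj++)
+//   //   for (j = 0; j < 4; j++)
+//   //     for (i = 0; i < 4; i++) {
+//   //       double b = B[(jj%4) + 4*j + (jj/4)*4*sdb];
+//   //       D0[i][j] += A0[i + 4*jj] * b;   // rows 0..3
+//   //       D1[i][j] += A1[i + 4*jj] * b;   // rows 4..7
+//   //     }
+//
+// which is why B advances by 4*sdb doubles (r14) only once every four
+// k-iterations and why the upcoming B panels are software-prefetched.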
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k
+// r11 <- A+4*sda*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- B+4*k*sizeof(double)
+// r12 <- C+4*k*sizeof(double)
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
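+// note: illustrative reference only (index names are assumptions): with the 8x4
+// block of A held in ymm0..ymm7 (two 4-row panels), this routine performs
+//
+//   for(jj=0; jj<k; jj++)                      // columns of B and C
+//     for(ll=0; ll<4; ll++)                    // 4 columns of A / rows of B
+//       for(ii=0; ii<8; ii++)                  // 8 rows, split over 2 C panels
+//         C(ii,jj) += A(ii,ll) * B[ll+4*jj];
+//
+// where C(ii,jj) is the element at byte offset 8*(ii%4)+32*jj of the C panel
+// selected by ii/4 (the two panels are 32*sdc bytes apart, cf. r13).
+//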
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
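+// note: illustrative reference only (index names are assumptions): this edge
+// consumes the first kend = min(k, 4-offB) inner iterations so that B becomes
+// aligned to a 4-row panel boundary before the main kernel takes over:
+//
+//   for(ll=0; ll<kend; ll++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         D(ii,jj) += A(ii,ll) * B(offB+ll, jj);  // B indexed within its panel
+//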
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+ movq %r11, %rax // A1 <- A0
+ addq %r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%rax), %ymm14 // A1[0]
+ vbroadcastsd 0(%r13), %ymm13 // B[0]
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vfmadd231pd %ymm14, %ymm13, %ymm4
+ vbroadcastsd 32(%r13), %ymm13 // B[1]
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vfmadd231pd %ymm14, %ymm13, %ymm5
+ vbroadcastsd 64(%r13), %ymm13 // B[2]
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vfmadd231pd %ymm14, %ymm13, %ymm6
+ vbroadcastsd 96(%r13), %ymm13 // B[3]
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vfmadd231pd %ymm14, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $32, %rax // A1+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
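+// note: illustrative reference only: as for the 8x4 edge above, this routine
+// consumes kend = min(k, 4-offB) inner iterations so that B becomes panel
+// aligned, here for a 4x8 block:
+//
+//   for(ll=0; ll<kend; ll++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D(ii,jj) += A(ii,ll) * B(offB+ll, jj);
+//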
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13 // B
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13 // B
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13 // B
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13 // B
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
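+// note: illustrative reference only (index names are assumptions): this edge
+// processes the 4x4 triangular corner of the upper-triangular factor; writing
+// B(r,c) for the element at byte offset 8*(r+4*c) of the packed B panel:
+//
+//   for(ll=0; ll<4; ll++)
+//     for(jj=0; jj<=ll; jj++)                   // only the triangle contributes
+//       for(ii=0; ii<8; ii++)
+//         D(ii,jj) += A(ii,ll) * B(jj,ll);
+//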
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // A1 <- A0
+ addq %r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 32(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 64(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 96(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
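+// note: illustrative reference only: same triangular corner as the routine
+// above, but every step is guarded by the remaining k, so blocks with k<4 are
+// handled correctly:
+//
+//   for(ll=0; ll<(k<4 ? k : 4); ll++)
+//     for(jj=0; jj<=ll; jj++)
+//       for(ii=0; ii<8; ii++)
+//         D(ii,jj) += A(ii,ll) * B(jj,ll);
+//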
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r13
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r15
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
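+// note: illustrative reference only (index names are assumptions): with B lower
+// triangular and accessed non-transposed, column jj only receives contributions
+// from rows ll>=jj; the edge consumes enough initial rows of B (kend, which
+// depends on offB, see the cases below) to pass the triangular corner and reach
+// a panel boundary:
+//
+//   for(ll=0; ll<kend; ll++)
+//     for(jj=0; jj<=(ll<3 ? ll : 3); jj++)
+//       for(ii=0; ii<8; ii++)
+//         D(ii,jj) += A(ii,ll) * B(ll,jj);       // element (ll,jj) of triangular B
+//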
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
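+// note: illustrative reference only: same lower-triangular edge as the routine
+// above, with every step additionally guarded by the remaining k, so the loop
+// terminates early when k is exhausted:
+//
+//   for(ll=0; ll<kend && ll<k; ll++)
+//     for(jj=0; jj<=(ll<3 ? ll : 3); jj++)
+//       for(ii=0; ii<8; ii++)
+//         D(ii,jj) += A(ii,ll) * B(ll,jj);
+//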
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
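+// note: illustrative only: the nt kernels keep their partial sums rotated along
+// diagonals (see the input layout above); the two vblendpd stages below regroup
+// them into plain columns, first mixing lanes {0,2} of one register with lanes
+// {1,3} of its partner (masks 0xa/0x5), then lanes {0,1} with {2,3} (masks
+// 0xc/0x3), separately for ymm0..ymm3 and ymm4..ymm7.
+//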
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ movq %r10, %r12 // C1 <- C0
+ addq %r11, %r12 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
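+// note: illustrative reference only (index names are assumptions): with the 8x4
+// accumulator in ymm0..ymm7 (two 4-row panels), this routine computes
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<8; ii++)
+//       acc(ii,jj) = alpha*acc(ii,jj) + (beta!=0.0 ? beta*C(ii,jj) : 0.0);
+//
+// the beta==0.0 branch skips the loads from C entirely.
+//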
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
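+// note: illustrative reference only: same computation as inner_scale_ab_8x4_lib4
+// above, except that C is read starting at a row offset (r12, 0..3) inside its
+// panel, so the 8 rows of each column span up to three consecutive 4-row panels
+// of C; the vblendpd/vperm2f128/vshufpd sequences below rotate those misaligned
+// loads into register order before the beta update:
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<8; ii++)
+//       acc(ii,jj) = alpha*acc(ii,jj) + (beta!=0.0 ? beta*C(ii+offset,jj) : 0.0);
+//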
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ jmp 3f
+
+0:
+
+ movq %rax, %rbx // C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
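+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing, the
+// actual storage is the panel-major lib4 format with C1 = C0 + 4*sdc*sizeof(double)):
+//   // the blends first restore column order from the diagonally-rotated accumulators,
+//   // then, unless beta==0.0, the two 4x4 panels of C are folded in
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           D[ii][jj] = alpha*D[ii][jj] + beta*C[ii][jj];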
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
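+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing):
+// all eight accumulators ymm0..ymm7 are scaled and eight 4-double columns of C are read:
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//           D[ii][jj] = alpha*D[ii][jj] + beta*C[ii][jj];   // skipped entirely when beta==0.0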
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
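+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): only
+// ymm0..ymm7 are transposed here, the two 4x4 register blocks of the 8x4 accumulator acc
+// become a 4x8 result, then alpha and beta are applied:
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//           D[ii][jj] = alpha*acc[jj][ii] + beta*C[ii][jj];   // C skipped when beta==0.0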
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta, with generic row offset in C
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
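+//
+// reference addressing for the offset path (a C-like sketch added for clarity; the
+// indexing is illustrative, C is stored panel-major with panel stride 4*sdc doubles).
+// After the same blend and alpha scaling as in the aligned routine, row ii of the 8x4
+// tile of C sits at panel row (offset+ii)%4 of panel (offset+ii)/4, i.e. roughly
+//   c_ij = C[ ((offset+ii)/4)*4*sdc + (offset+ii)%4 + jj*4 ];
+//   D[ii][jj] = alpha*D[ii][jj] + beta*c_ij;        // skipped entirely when beta==0.0
+// the blend/permute sequences below assemble each 4-wide column from two adjacent panels.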
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ jmp 3f
+
+0:
+
+ movq %rax, %rbx // C0
+ addq %r14, %rbx // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
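+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): this is
+// the alpha==1.0, beta==1.0 special case, so after the blends C is simply accumulated:
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           D[ii][jj] = D[ii][jj] + C[ii][jj];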
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
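+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): the two
+// 4x4 register blocks in ymm0..ymm7 are transposed and C is added with unit alpha/beta
+// (the 1.0 constant is loaded from LC04 and applied via fmadd):
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//           D[ii][jj] = acc[jj][ii] + C[ii][jj];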
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x8_lib4, @function
+inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x8_lib4, .-inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
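+//
+// reference factorization step (a C-like sketch added for clarity; illustrative indexing,
+// the registers hold the 4x4 block to factorize stacked over a 4x4 panel to update, and
+// kn limits how many columns are processed):
+//   for(jj=0; jj<4; jj++) {
+//       tmp = D[jj][jj];
+//       dinv = tmp>0.0 ? 1.0/sqrt(tmp) : 0.0;    // a non-positive pivot zeroes the column
+//       inv_diag_E[jj] = dinv;
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= dinv;
+//       if(kn<jj+2) return;                       // vs variant: stop after kn columns
+//       for(kk=jj+1; kk<4; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= D[kk][jj] * D[ii][jj];
+//   }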
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
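+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): the 8x4
+// block D in ymm0..ymm7 is multiplied by E^{-T}, E lower triangular with the reciprocal
+// diagonal precomputed in inv_diag_E:
+//   for(jj=0; jj<4; jj++) {
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= inv_diag_E[jj];
+//       for(kk=jj+1; kk<4; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= E[kk][jj] * D[ii][jj];
+//   }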
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
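+//
+// reference operation (a C-like sketch added for clarity; X and the indexing are
+// illustrative names for the 4x8 block held in ymm0..ymm7, D is the 8x8 lower
+// triangular factor stored as two 4-row panels, the second one at r10 + r11):
+//   for(jj=0; jj<8; jj++) {
+//       for(ii=0; ii<4; ii++) X[ii][jj] *= inv_diag_D[jj];
+//       for(kk=jj+1; kk<8; kk++)
+//           for(ii=0; ii<4; ii++) X[ii][kk] -= D[kk][jj] * X[ii][jj];
+//   }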
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x8_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x8_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $6, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $7, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $8, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
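+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): same as
+// the _inv variant but with a unit diagonal, so there is no scaling, only the updates:
+//   for(jj=0; jj<4; jj++)
+//       for(kk=jj+1; kk<4; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= E[kk][jj] * D[ii][jj];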
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
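+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): D is
+// multiplied by E^{-T} with E upper triangular, so the columns are processed from the
+// last one back to the first, using the reciprocal diagonal in inv_diag_E:
+//   for(jj=3; jj>=0; jj--) {
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= inv_diag_E[jj];
+//       for(kk=0; kk<jj; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= E[kk][jj] * D[ii][jj];
+//   }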
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
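+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): D is
+// multiplied by E^{-1} with E upper triangular (normal, not transposed), column by
+// column from the first one, using the reciprocal diagonal in inv_diag_E:
+//   for(jj=0; jj<4; jj++) {
+//       for(kk=0; kk<jj; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][jj] -= E[kk][jj] * D[ii][kk];
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= inv_diag_E[jj];
+//   }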
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vfnmadd231pd %ymm4, %ymm12, %ymm5
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vfnmadd231pd %ymm4, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vfnmadd231pd %ymm5, %ymm12, %ymm6
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vfnmadd231pd %ymm4, %ymm12, %ymm7
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vfnmadd231pd %ymm5, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vfnmadd231pd %ymm6, %ymm12, %ymm7
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
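+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): forward
+// substitution E*X = D solved in place, with E an 8x8 unit lower triangular matrix stored
+// as two 4-row panels (E1 = E0 + 4*sde*sizeof(double)) and X the 8x4 block in ymm0..ymm7:
+//   for(kk=0; kk<8; kk++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=kk+1; ii<8; ii++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];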
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r12 // E1 <- E0
+ addq %r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // left block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r12), %ymm14
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r12), %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r12), %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vmovapd 96(%r12), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ addq $128, %r12
+
+
+ // right block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r12), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+ vmovapd 32(%r12), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+ vmovapd 64(%r12), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
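+//
+// reference operation (a C-like sketch added for clarity; illustrative indexing): backward
+// substitution E*X = D solved in place, with E an 8x8 upper triangular matrix stored as
+// two 4-row panels (E1 = E0 + 4*sde*sizeof(double)) and the reciprocal diagonal in
+// inv_diag_E:
+//   for(kk=7; kk>=0; kk--)
+//       for(jj=0; jj<4; jj++) {
+//           D[kk][jj] *= inv_diag_E[kk];
+//           for(ii=0; ii<kk; ii++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];
+//       }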
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r13 // E1 <- E0
+ addq %r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ vmovapd 224(%r13), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the 128-bit vmovapd below already zeroes the upper lanes)
+ vmovapd 192(%r13), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r13), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
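+//
+// variable-size variant: only the first km rows of the 8-row tile hold valid
+// data, so the elimination steps for rows 5..7 below are guarded on km and
+// skipped when those rows are not present; in terms of the scalar sketch of
+// the fixed-size routine above, the outer loop would start at min(km,8)-1
+// instead of 7.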
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ cmpl $7, %r13d
+ jle 0f
+
+ vmovapd 224(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+0:
+ cmpl $6, %r13d
+ jle 1f
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the 128-bit vmovapd below already zeroes the upper lanes)
+ vmovapd 192(%r14), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+1:
+ cmpl $5, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r14), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
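+//
+// for reference, a scalar C sketch of the same factorization (not assembled;
+// the 8x4 block is shown as a dense array d[8][4], no pivoting, L gets a unit
+// diagonal, U stays on and above the diagonal; names are illustrative only):
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		for(kk=0; kk<jj; kk++)
+//			for(ii=kk+1; ii<8; ii++)
+//				d[ii][jj] -= d[ii][kk] * d[kk][jj];
+//		inv_diag_E[jj] = 1.0 / d[jj][jj];
+//		for(ii=jj+1; ii<8; ii++)
+//			d[ii][jj] *= inv_diag_E[jj];
+//		}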
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
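+//
+// D is stored in the panel-major lib4 layout: 4-row panels, each 4-double
+// column contiguous (32 bytes apart), second panel 4*sdd doubles after the
+// first; for reference, an illustrative C view (D0 = D, D1 = D + 4*sdd):
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			{
+//			D0[ii+4*jj] = d[ii+0][jj]; // rows 0..3 -> first panel
+//			D1[ii+4*jj] = d[ii+4][jj]; // rows 4..7 -> second panel
+//			}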
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r15)
+ vmovapd %ymm5, 32(%r15)
+ vmovapd %ymm6, 64(%r15)
+ vmovapd %ymm7, 96(%r15)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
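+//
+// the row mask for the second panel is built arithmetically: km is converted
+// to double, broadcast and subtracted from the constant at .LC03 (presumably
+// { 4.5, 5.5, 6.5, 7.5 }, defined elsewhere in this file), so a lane goes
+// negative, i.e. gets its sign bit set, exactly when its row index 4+ii is
+// below km, which is what vmaskmovpd tests; kn then limits how many of the
+// four columns are written via the compare/branch chain.  Illustrative C:
+//
+//	mask[ii] = (4.5+ii) - (double) km; // sign bit set <=> store row 4+ii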
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r15)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r15)
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r15)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
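+//
+// lower-triangular store: in the first panel only the elements on and below
+// the main diagonal come from the accumulators, the strictly upper part of
+// columns 1..3 is re-loaded from D and kept via vblendpd, while the second
+// panel (rows 4..7) is stored in full; illustrative C for the first panel
+// (D0 = D as in the sketch of inner_store_8x4_lib4 above):
+//
+//	if(ii>=jj) D0[ii+4*jj] = d[ii][jj]; // else D0[ii+4*jj] is left unchanged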
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovapd	%ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r15)
+ vmovapd %ymm5, 32(%r15)
+ vmovapd %ymm6, 64(%r15)
+ vmovapd %ymm7, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
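+//
+// generalized store: the tile is written into a sub-matrix window.  n0 shifts
+// the accumulator columns and advances D so that only columns n0..min(n1,4)-1
+// are touched, m0/m1 become per-row masks built as in the vs store, and a
+// non-zero row offset rotates each column across the destination panels with
+// vperm2f128/vshufpd before the masked stores; roughly, in illustrative C:
+//
+//	for(jj=n0; jj<n1 && jj<4; jj++)
+//		for(ii=m0; ii<m1 && ii<8; ii++)
+//			D[offset+ii][jj] = d[ii][jj]; // dense view of D; row offset+ii
+//			                              // may fall into the next panel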
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm13
+#endif
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x4, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x2, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
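+//
+// for reference, the operation in illustrative C (panel-major operands shown
+// as dense arrays; A is 8xk with panel stride sda, B is 4xk, C and D are 8x4
+// tiles with panel strides sdc and sdd):
+//
+//	// D = beta*C + alpha * A * B^T
+//	for(ii=0; ii<8; ii++)
+//		for(jj=0; jj<4; jj++)
+//			{
+//			double tmp = 0.0;
+//			for(kk=0; kk<k; kk++)
+//				tmp += A[ii][kk] * B[jj][kk];
+//			D[ii][jj] = beta[0]*C[ii][jj] + alpha[0]*tmp;
+//			}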
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
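+//
+// 4x8 variant: D = beta*C + alpha * A * B^T with A 4xk and B 8xk; it reuses
+// the 8x4 inner kernel with the roles of A and B swapped and then transposes
+// the result while scaling (inner_tran_scale_ab_4x8_lib4), so the tile stored
+// below is the transpose of an 8x4 product.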
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
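+//
+// generalized variant: same product as kernel_dgemm_nt_8x4_lib4, but C is read
+// starting offsetC rows into its panel, D is written starting offsetD rows
+// into its panel, and only rows [m0,m1) and columns [n0,n1) of the 8x4 tile
+// are written (see inner_store_8x4_gen_lib4 above).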
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
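+//
+// nn variant, in illustrative dense terms:
+//
+//	// D[ii][jj] = beta[0]*C[ii][jj] + alpha[0] * sum_kk A[ii][kk]*B[kk][jj]
+//
+// offsetB is the row offset of the first element of B inside its 4-row panel;
+// the dedicated edge routine below consumes that initial, partial panel of B
+// before the main nn kernel runs on aligned panels.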
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
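+//
+// The _gen_ variants carry extra panel offsets (offB, offC, offD) and the index
+// ranges m0/m1 and n0/n1, forwarded to the generalized scale/store routines so
+// that, presumably, only the selected rows and columns of a possibly unaligned
+// sub-block of C/D are read and written.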
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
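+//
+// The _vs_ ("variable size") variants take the trailing km/kn arguments, which
+// are handed to the masked store routine so that only km rows and kn columns of
+// the 8x4 tile are written back, presumably for the tail blocks of a matrix
+// whose dimensions are not multiples of the block size.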
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrmm_nn_rl_8x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+ .type kernel_dtrmm_nn_rl_8x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_vs_lib4
+_kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+ .def kernel_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_vs_lib4, .-kernel_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
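+	// (the first 4x4 block of the triangular factor is skipped here: the loop
+	// count is reduced to k-4 and A/B are advanced by 4*bs = 128 bytes, and that
+	// "initial triangle" contribution is added by the dedicated edge routine below)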
+
+ movq ARG1, %r10 // k
+	subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+	subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+	movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
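+//
+// Cholesky kernel: the sub gemm loop accumulates -A*B' in the registers,
+// inner_scale_11 adds C with unit coefficients, and the dpotrf edge routine
+// then factorizes the resulting 8x4 block; inv_diag_D, as the name suggests,
+// presumably receives the reciprocals of the computed diagonal entries of D.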
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
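+//
+// Fused syrk+potrf kernel: a first nt loop adds Ap*Bp' (kp iterations) and a
+// second nt loop subtracts Am*Bm' (km iterations) into the same accumulator
+// before the factorization edge runs, which presumably avoids writing the
+// intermediate 8x4 block back to memory between the update and the factorization.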
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
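+//
+// Triangular-solve kernel: after accumulating C - A*B' (sub gemm loop plus
+// inner_scale_11), the rlt_inv edge routine solves against the lower-triangular
+// factor E using the precomputed reciprocal diagonal inv_diag_E (presumably to
+// avoid divisions in the inner solve); km/kn again mask the final store.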
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
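+//
+// The 4x8 kernels reuse the 8x4 inner gemm loop with the roles of A and B
+// swapped (below, B/sdb go into the slots that normally carry A/sda), so the
+// accumulator presumably holds the transpose of the desired tile; the
+// inner_tran_scale_11_4x8 and 4x8 store routines then transpose it while
+// scaling and writing out.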
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq ARG1, %r10 // k
+	movq ARG3, %r11 // B
+	movq ARG4, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+	movq ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG3, %r11 // Bp
+ movq ARG4, %r12 // sdbp
+	sall $5, %r12d // 4*sdbp*sizeof(double)
+ movq ARG2, %r13 // Ap
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG7, %r11 // Bm
+ movq ARG8, %r12 // sdbm
+	sall $5, %r12d // 4*sdbm*sizeof(double)
+ movq ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG11, %r10 // E
+ movq ARG12, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG13, %r12 // inv_diag_E
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG14, %r11 // km
+ movq ARG15, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dtrsm_nt_rl_inv_4x8_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x8_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x8_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq ARG1, %r10 // k
+	movq ARG3, %r11 // B
+	movq ARG4, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+	movq ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x8_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
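+//
+// Compared with the rl_one kernels above, this variant solves against an
+// upper triangular E with a general (non-unit) diagonal: inv_diag_E appears
+// to hold precomputed reciprocals of the diagonal of E, so that
+// inner_edge_dtrsm_rut_inv_8x4_lib4 can use multiplications instead of
+// divisions.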
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
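+//
+// In the nn variants B is not transposed, so the gemm stage walks B across
+// row panels and therefore also needs the panel stride sdb (scaled below as
+// 4*sdb*sizeof(double)) when calling inner_kernel_dgemm_sub_nn_8x4_lib4.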
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
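+//
+// Left-upper solve: E is accessed in panel-major form as well, so its panel
+// stride sde is passed together with inv_diag_E (presumably the reciprocals
+// of the diagonal of E) to inner_edge_dtrsm_lun_inv_8x4_lib4.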
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
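+//
+// Rough sketch (illustrative only): the kernel accumulates C - A*B via
+// inner_kernel_dgemm_sub_nn_8x4_lib4 and inner_scale_11_8x4_lib4, then
+// inner_edge_dgetrf_l_8x4_lib4 factorizes the resulting 8x4 block (the
+// left/lower part of a blocked, unpivoted LU step), writing what its name
+// suggests are the reciprocals of the factor's diagonal to inv_diag_D,
+// before the factored panel is stored to D.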
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ // epilogue
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
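+//
+// Rough sketch (illustrative only, LAPACK-style naming): the kernel appears
+// to apply a block reflector built from 4 reflector vectors (pV, with 4x4
+// triangular factor pT) to an 8-row panel pD from the right, roughly
+//
+//   W  = pD * V;        // partly inline, partly inner_kernel_dgemm_add_nt_8x4_lib4
+//   W  = W * T;         // the triangular factor, applied explicitly below
+//   pD = pD + W * V^T;  // inline edge + inner_kernel_dgebp_add_nn_8x4_lib4
+//
+// with any sign convention folded into the stored T factor.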
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_8_lib4
+ .type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_8_lib4
+ .def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ //
+ vmovapd 32(%r11), %ymm1
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vbroadcastsd 32(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ vfmadd231pd %ymm13, %ymm5, %ymm4
+ //
+ vmovapd 64(%r11), %ymm2
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vbroadcastsd 64(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vfmadd231pd %ymm13, %ymm6, %ymm4
+ vbroadcastsd 72(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ vfmadd231pd %ymm13, %ymm6, %ymm5
+ //
+ vmovapd 96(%r11), %ymm3
+ vmovapd 96(%r11, %r12, 1), %ymm7
+ vbroadcastsd 96(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vfmadd231pd %ymm13, %ymm7, %ymm4
+ vbroadcastsd 104(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vfmadd231pd %ymm13, %ymm7, %ymm5
+ vbroadcastsd 112(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+ vfmadd231pd %ymm13, %ymm7, %ymm6
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vfmadd231pd %ymm6, %ymm12, %ymm7
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vfmadd231pd %ymm5, %ymm12, %ymm7
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vfmadd231pd %ymm5, %ymm12, %ymm6
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vfmadd231pd %ymm4, %ymm12, %ymm7
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vfmadd231pd %ymm4, %ymm12, %ymm6
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vfmadd231pd %ymm4, %ymm12, %ymm5
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
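+
+// Note: each pair of .long values in LC02 above and the tables below is the
+// little-endian 32-bit split of an IEEE-754 double, e.g. (0, 1071644672) =
+// 0x3FE0000000000000 = 0.5, (0, 1072693248) = 1.0, (0, -1074790400) = -1.0.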
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x8_lib4.S b/kernel/avx2/kernel_dgemm_8x8_lib4.S
new file mode 100644
index 0000000..954c96d
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x8_lib4.S
@@ -0,0 +1,5625 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define ARG19 STACKSIZE + 104(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define ARG19 STACKSIZE + 152(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
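+
+// Note on the macros above: ARG1-ARG6 (Linux/Mac) and ARG1-ARG4 (Windows) map
+// to the register arguments of the respective calling convention, while the
+// remaining ARGn expand to the caller's stack slots, addressed as
+// STACKSIZE+offset(%rsp) because PROLOGUE has moved %rsp down by STACKSIZE
+// bytes to save the callee-saved registers. The Windows variant additionally
+// saves %rdi, %rsi and %xmm6-%xmm15, which are callee-saved in the Windows
+// x64 calling convention.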
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
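+//
+// Rough sketch (illustrative only): A is read as two 4-row panels (at r11 and
+// r11 + r12) and B as two 4-row panels (at r13 and r13 + r14); every
+// k-iteration broadcasts elements of B and accumulates
+//
+//   ymm0-ymm3  += A[0:4,:] * B[0:4,:]^T
+//   ymm4-ymm7  += A[4:8,:] * B[0:4,:]^T
+//   ymm8-ymm11 += A[4:8,:] * B[4:8,:]^T
+//
+// i.e. the 8x4 left block plus the lower-right 4x4 block of the 8x8 nt
+// product (presumably for kernels that only need the lower part, such as
+// syrk/potrf). The main loop is unrolled by 4 in k, followed by a clean-up
+// loop for the remainder.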
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x8_lib4, @function
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 0(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+// vmovapd 0(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+// vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+// vmovapd 0(%r11, %r12, 1), %ymm13
+// vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ subl $1, %r10d
+
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ addq $32, %r13
+
+ vbroadcastsd -8(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd -8(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x8_lib4, .-inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x8_lib4, @function
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 0(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+// vmovapd 0(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+// vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+// vmovapd 0(%r11, %r12, 1), %ymm13
+// vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ subl $1, %r10d
+
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ addq $32, %r13
+
+ vbroadcastsd -8(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd -8(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x8_lib4, .-inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
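+// A hedged scalar C sketch of the update performed below (reference only,
+// not part of the kernel), assuming the lib4 panel-major layout with panel
+// height 4, so element (i,j) of C lives at C[(i/4)*4*sdc + 4*j + i%4]; acc
+// stands for the block held in ymm0-ymm11 (columns 0-3 of both 4-row panels
+// plus columns 4-7 of the lower panel):
+//
+//   // hypothetical reference helper, not part of BLASFEO
+//   static void ref_scale_ab_8x8l(double alpha, double beta,
+//                                 const double *C, int sdc, double acc[8][8])
+//   {
+//       for(int j=0; j<8; j++)
+//           for(int i=(j<4 ? 0 : 4); i<8; i++)
+//               acc[i][j] = alpha*acc[i][j]
+//                         + beta*C[(i/4)*4*sdc + 4*j + i%4];
+//   }
+//
+// The beta*C term is skipped altogether when beta == 0.0 (the vucomisd
+// branch), so C is never read in that case.
+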
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib4, @function
+inner_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib4:
+#endif
+#endif
+
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
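+// Hedged picture of this routine (reference only): each group of four
+// accumulator registers is first transposed as a 4x4 block with the
+// vunpcklpd/vunpckhpd + vperm2f128 idiom, then the same alpha/beta update as
+// in the non-transposed case is applied; here C is read over the upper
+// shape (rows 0-3 of all eight columns plus rows 4-7 of columns 4-7):
+//
+//   // acc is the block after the in-register 4x4 transposes
+//   for(int j=0; j<8; j++)
+//       for(int i=0; i<(j<4 ? 4 : 8); i++)
+//           acc[i][j] = alpha*acc[i][j] + beta*C[(i/4)*4*sdc + 4*j + i%4];
+//
+// As above, the beta*C term is skipped when beta == 0.0.
+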
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_8x8_lib4, @function
+inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_8x8_lib4:
+#endif
+#endif
+
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_8x8_lib4, .-inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
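+// This is the alpha = 1.0, beta = 1.0 special case of the generic scaling
+// routine above: the constant 1.0 is taken from LC04 and each accumulator
+// simply gets the matching element of C added, i.e. (hedged reference)
+//
+//   acc[i][j] += C[(i/4)*4*sdc + 4*j + i%4];
+//
+// over the same lower-shaped 8x8 region.
+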
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_lib4, @function
+inner_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_lib4, .-inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_8x8_lib4, @function
+inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_8x8_lib4:
+#endif
+#endif
+
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_8x8_lib4, .-inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
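+// The register-level code below is a right-looking Cholesky factorization of
+// the 8x8 lower block held in ymm0-ymm11; the reciprocal of each diagonal
+// square root is stored to inv_diag_E, and a non-positive pivot is replaced
+// by 0.0. Hedged scalar C reference (acc stands for the in-register block;
+// the vs variant stops updating once kn columns have been factorized):
+//
+//   for(int k=0; k<8; k++) {
+//       double dkk = acc[k][k];
+//       double idk = (dkk > 0.0) ? 1.0/sqrt(dkk) : 0.0;  // 0.0 on breakdown
+//       inv_diag_E[k] = idk;
+//       for(int i=k; i<8; i++) acc[i][k] *= idk;         // scale column k
+//       for(int j=k+1; j<8; j++)                         // trailing update
+//           for(int i=j; i<8; i++)
+//               acc[i][j] -= acc[i][k]*acc[j][k];
+//   }
+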
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x8_vs_lib4, @function
+inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x8_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ vmovsd %xmm8, %xmm8, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 9f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+10:
+ vmovsd %xmm13, 32(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm8, %ymm8, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm8, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm8, %ymm8, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm9, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 11f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+12:
+ vmovsd %xmm13, 40(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm9, %ymm9, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm10, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 13f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+14:
+ vmovsd %xmm13, 48(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm10, %ymm10, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm10, %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+// vextractf128 $0x1, %ymm11, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm11, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 15f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+16:
+ vmovsd %xmm13, 56(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x8_vs_lib4, .-inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
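+// Triangular-solve edge for D := D * E^{-T}, with E lower triangular and its
+// inverted diagonal precomputed in inv_diag_E. The code walks the columns
+// left to right; a hedged scalar C reference (d stands for the in-register
+// block, e for the lib4-stored factor, and only the rows actually held here
+// take part -- rows 0-7 for columns 0-3, rows 4-7 for columns 4-7):
+//
+//   for(int j=0; j<8; j++) {
+//       for(int i=0; i<8; i++) d[i][j] *= inv_diag_E[j];
+//       for(int k=j+1; k<8; k++)
+//           for(int i=0; i<8; i++)
+//               d[i][k] -= d[i][j]*e[k][j];
+//   }
+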
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8l_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8l_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8l_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
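+// Upper counterpart of the 8x8l edge above: the registers hold rows 0-3 of
+// all eight columns plus rows 4-7 of columns 4-7, while rows 4-7 of columns
+// 0-3 are not held here at all -- they are re-read from D (r13, stride r14)
+// one column at a time and only contribute their rank-1 term to columns 4-7.
+// Hedged scalar picture of that extra term, for a column j < 4 and k >= 4:
+//
+//   for(int i=4; i<8; i++)
+//       d[i][k] -= D[4*sdd + 4*j + i%4] * e[k][j];   // D re-read, not updated
+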
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8u_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8u_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm12
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vmovapd 32(%r13, %r14, 1), %ymm12
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vmovapd 64(%r13, %r14, 1), %ymm12
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ vmovapd 96(%r13, %r14, 1), %ymm12
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8u_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// r15d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// r15d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm12
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vmovapd 32(%r13, %r14, 1), %ymm12
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vmovapd 64(%r13, %r14, 1), %ymm12
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ vmovapd 96(%r13, %r14, 1), %ymm12
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r15d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r15d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r15d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+
+
+// subq $128, %r10
+// vmovapd 0(%r10, %r11, 1), %ymm4
+// vmovapd 32(%r10, %r11, 1), %ymm5
+// vmovapd 64(%r10, %r11, 1), %ymm6
+// vmovapd 96(%r10, %r11, 1), %ymm7
+
+
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
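+// Plain (unmasked) store of the lower 8x8 shape -- the same footprint the
+// ab-scaling routine above reads from C. Hedged C equivalent, with lib4
+// addressing D[(i/4)*4*sdd + 4*j + i%4]:
+//
+//   for(int j=0; j<8; j++)
+//       for(int i=(j<4 ? 0 : 4); i<8; i++)
+//           D[(i/4)*4*sdd + 4*j + i%4] = acc[i][j];
+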
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8L_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8l_lib4, @function
+inner_store_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8l_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8l_lib4, .-inner_store_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8U_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8u_lib4, @function
+inner_store_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8u_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8u_lib4, .-inner_store_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
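+// Variable-size store: the lower-panel rows are written with vmaskmovpd,
+// whose mask is built arithmetically from km -- broadcast (double)km,
+// subtract it from the per-lane constants in .LC03 (presumably
+// {4.5, 5.5, 6.5, 7.5}, i.e. lower-panel row index plus one half), and let
+// vmaskmovpd store a lane only where the difference has its sign bit set,
+// that is where 4+lane < km. Hedged C sketch of the mask construction:
+//
+//   long long mask[4];                     // sign bit set => store this lane
+//   for(int l=0; l<4; l++)
+//       mask[l] = (4.5 + l - (double)km < 0.0) ? -1LL : 0LL;
+//
+// Columns 5, 6 and 7 of the lower panel are additionally gated on kn by the
+// cmpl/jl ladder that follows.
+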
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8L_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8l_vs_lib4, @function
+inner_store_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8l_vs_lib4, .-inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8U_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8u_vs_lib4, @function
+inner_store_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+
+ vmovapd %ymm4, 128(%r10)
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmovapd %ymm5, 160(%r10)
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmovapd %ymm6, 192(%r10)
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmovapd %ymm7, 224(%r10)
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8u_vs_lib4, .-inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
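+// Lower-triangular store: same footprint as inner_store_8x8l_lib4, except
+// that inside the two diagonal 4x4 blocks the elements above the diagonal
+// keep whatever is already in memory (the vmovapd + vblendpd pairs below).
+// Hedged scalar equivalent for column j of one diagonal block:
+//
+//   for(int i=0; i<4; i++)
+//       if(i >= j) D[4*j + i] = acc[i][j];   // write on/below the diagonal only
+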
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib4, @function
+inner_store_l_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd 160(%r10, %r11, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd 192(%r10, %r11, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm10, %ymm10
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd 224(%r10, %r11, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm11, %ymm11
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib4, .-inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib4, @function
+inner_store_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmovapd 160(%r10, %r11, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmovapd 192(%r10, %r11, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm10, %ymm10
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 224(%r10, %r11, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm11, %ymm11
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_vs_lib4, .-inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
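+// Generalized store: writes only the sub-window [m0,m1) x [n0,n1) of the
+// block, with an extra register-shuffle path further down for a non-zero
+// row offset. Rows are selected with two vmaskmovpd masks (one per 4-row
+// panel) built from m0 and m1 as in the *_vs stores; columns are handled by
+// rotating the accumulator registers left by n0 positions (the vmovapd
+// blocks below) and clamping the column count. Hedged C picture of the
+// column set-up:
+//
+//   if(n1 > 8) n1 = 8;
+//   int cols = n1 - n0;     // number of columns actually written
+//   D += 4*n0;              // skip the first n0 lib4 columns of D
+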
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib4, @function
+inner_store_8x8_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ vmovapd %ymm10, %ymm9
+ vmovapd %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ vmovapd %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r11, %r12, 1)
+ cmpl $6, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r11, %r12, 1)
+ cmpl $7, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm12
+ vshufpd $0x5, %ymm8, %ymm12, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm12
+ vshufpd $0x5, %ymm9, %ymm12, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm12
+ vshufpd $0x5, %ymm10, %ymm12, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm12
+ vshufpd $0x5, %ymm11, %ymm12, %ymm11
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC08(%rip), %ymm14, %ymm12
+ vandpd .LC05(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC08(%rip), %ymm14, %ymm12
+ vandpd LC05(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC08(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC08(%rip), %ymm15, %ymm15
+#endif
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm11
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC09(%rip), %ymm14, %ymm12
+ vandpd .LC06(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC09(%rip), %ymm14, %ymm12
+ vandpd LC06(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC09(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC09(%rip), %ymm15, %ymm15
+#endif
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm12
+ vshufpd $0x5, %ymm12, %ymm8, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm12
+ vshufpd $0x5, %ymm12, %ymm9, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm12
+ vshufpd $0x5, %ymm12, %ymm10, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm12
+ vshufpd $0x5, %ymm12, %ymm11, %ymm11
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC10(%rip), %ymm14, %ymm12
+ vandpd .LC07(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC10(%rip), %ymm14, %ymm12
+ vandpd LC07(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC10(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC10(%rip), %ymm15, %ymm15
+#endif
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r11, %r12, 1)
+ vmaskmovpd %ymm8, %ymm13, 128(%r11, %r12, 2)
+ cmpl $6, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r11, %r12, 1)
+ vmaskmovpd %ymm9, %ymm13, 160(%r11, %r12, 2)
+ cmpl $7, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r11, %r12, 1)
+ vmaskmovpd %ymm10, %ymm13, 192(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r11, %r12, 1)
+ vmaskmovpd %ymm11, %ymm13, 224(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib4, .-inner_store_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
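+//
+// Operation sketch (assuming the lib4 panel-major layout used throughout BLASFEO,
+// with sda/sdb/sdc/sdd the panel strides of A, B, C, D): the kernel accumulates the
+// 8x8 block alpha*A*B^T + beta*C and writes back its "lower" part via
+// inner_store_8x8l_lib4. A hypothetical C-level call could look like
+//
+//   double alpha = 1.0, beta = 1.0;
+//   kernel_dgemm_nt_8x8l_lib4(k, &alpha, pA, sda, pB, sdb, &beta, pC, sdc, pD, sdd);
+//
+// where pA, pB, pC, pD are assumed to point at the first element of 4-wide panels.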
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8l_lib4
+ .type kernel_dgemm_nt_8x8l_lib4, @function
+kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8l_lib4
+_kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8l_lib4
+ .def kernel_dgemm_nt_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8l_lib4, .-kernel_dgemm_nt_8x8l_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
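+//
+// Counterpart of the 8x8l kernel: as the argument shuffling below shows, A and B are
+// handed to the inner gemm kernel in swapped order and the accumulator is transposed
+// while scaling (inner_tran_scale_ab_8x8_lib4), so the "upper" part of the 8x8 block
+// is the one stored (inner_store_8x8u_lib4).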
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8u_lib4
+ .type kernel_dgemm_nt_8x8u_lib4, @function
+kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8u_lib4
+_kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8u_lib4
+ .def kernel_dgemm_nt_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // B
+ movq ARG6, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+ movq ARG4, %r14 // sda
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8u_lib4, .-kernel_dgemm_nt_8x8u_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
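+//
+// "vs" (variable size) variant: the trailing km and kn arguments are assumed to give
+// the number of rows and columns actually written back (up to 8), so the kernel can
+// handle the bottom/right edges of a matrix; the write goes through the masked
+// inner_store_8x8l_vs_lib4.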
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8l_vs_lib4
+ .type kernel_dgemm_nt_8x8l_vs_lib4, @function
+kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8l_vs_lib4
+_kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8l_vs_lib4
+ .def kernel_dgemm_nt_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8l_vs_lib4, .-kernel_dgemm_nt_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8u_vs_lib4
+ .type kernel_dgemm_nt_8x8u_vs_lib4, @function
+kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8u_vs_lib4
+_kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8u_vs_lib4
+ .def kernel_dgemm_nt_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // B
+ movq ARG6, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+	movq ARG4, %r14 // sda
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8u_vs_lib4, .-kernel_dgemm_nt_8x8u_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
+// void kernel_dgemm_nt_8x8_gen_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8_gen_lib4
+ .type kernel_dgemm_nt_8x8_gen_lib4, @function
+kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8_gen_lib4
+_kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8_gen_lib4
+ .def kernel_dgemm_nt_8x8_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size kernel_dgemm_nt_8x8_gen_lib4, .-kernel_dgemm_nt_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
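+//
+// dsyrk variant: same alpha*A*B^T + beta*C contraction as the dgemm kernels above,
+// but only the lower-triangular part of the 8x8 result block is written back
+// (inner_store_l_8x8_lib4); the strictly upper part of D is left unchanged.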
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x8_lib4
+ .type kernel_dsyrk_nt_l_8x8_lib4, @function
+kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x8_lib4
+_kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x8_lib4
+ .def kernel_dsyrk_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x8_lib4, .-kernel_dsyrk_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x8_vs_lib4
+ .type kernel_dsyrk_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x8_vs_lib4
+_kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x8_vs_lib4
+ .def kernel_dsyrk_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+	movq ARG12, %r12 // km
+	movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x8_vs_lib4, .-kernel_dsyrk_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
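+//
+// Operation sketch: D <- chol(C - A*B^T). The running difference is built by
+// inner_kernel_dgemm_sub_nt_8x8_lib4 plus inner_scale_11_8x8_lib4, the 8x8 lower
+// Cholesky factor is then computed by inner_edge_dpotrf_8x8_vs_lib4 and stored as a
+// lower panel; inv_diag_D is assumed (as its name suggests) to receive the
+// reciprocals of the factor's diagonal entries.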
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x8_lib4
+ .type kernel_dpotrf_nt_l_8x8_lib4, @function
+kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x8_lib4
+_kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x8_lib4
+ .def kernel_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x8_lib4, .-kernel_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x8_vs_lib4
+ .type kernel_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x8_vs_lib4
+ .def kernel_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+ movq ARG12, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
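+//
+// Fused kernel: accumulates Ap*Bp^T over kp columns, subtracts Am*Bm^T over km
+// columns, adds C (inner_scale_11_8x8_lib4) and then factorizes the resulting 8x8
+// block exactly as kernel_dpotrf_nt_l_8x8_lib4 above.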
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x8_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+ movq ARG5, %r14 // sdbp
+ sall $5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ movq ARG6, %r10 // km
+ movq ARG7, %r11 // Am
+ movq ARG8, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG9, %r13 // Bm
+ movq ARG10, %r14 // sdbm
+ sall $5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG15, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x8_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
+// void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+ movq ARG5, %r14 // sdbp
+ sall $5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG6, %r10 // km
+ movq ARG7, %r11 // Am
+ movq ARG8, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG9, %r13 // Bm
+ movq ARG10, %r14 // sdbm
+ sall $5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG15, %r10 // inv_diag_D
+ movq ARG17, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG16, %r12 // km
+ movq ARG17, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
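+//
+// Operation sketch: the kernel forms C - A*B^T and then, as the inner edge routine's
+// name (dtrsm_rlt_inv) suggests, applies from the right the inverse of the transposed
+// lower-triangular 8x8 factor E, using the precomputed reciprocal diagonal
+// inv_diag_E; the result is stored in D as a "lower" 8x8 panel.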
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8l_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8l_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8l_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8u_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8u_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG4, %r11
+ movq ARG5, %r12
+	sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG2, %r13
+ movq ARG3, %r14
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG8, %r13 // D
+ movq ARG9, %r14 // sdd
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8u_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG4, %r11
+ movq ARG5, %r12
+	sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG2, %r13
+ movq ARG3, %r14
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG8, %r13 // D
+ movq ARG9, %r14 // sdd
+	sall $5, %r14d // 4*sdd*sizeof(double)
+ movq ARG14, %r15 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
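+//
+// Fused kernel: accumulates Ap*Bp^T, subtracts Am*Bm^T, adds C, and then performs the
+// same right triangular solve and variable-size store as
+// kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4 above.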
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ movq ARG6, %r10
+ movq ARG7, %r11
+ movq ARG8, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG9, %r13
+ movq ARG10, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG15, %r10 // E
+ movq ARG16, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG17, %r12 // inv_diag_E
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG18, %r12 // km
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG4, %r11
+ movq ARG5, %r12
+	sall $5, %r12d // 4*sdbp*sizeof(double)
+ movq ARG2, %r13
+ movq ARG3, %r14
+	sall $5, %r14d // 4*sdap*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ movq ARG6, %r10
+ movq ARG9, %r11
+ movq ARG10, %r12
+	sall $5, %r12d // 4*sdbm*sizeof(double)
+ movq ARG7, %r13
+ movq ARG8, %r14
+	sall $5, %r14d // 4*sdam*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG15, %r10 // E
+ movq ARG16, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG17, %r12 // inv_diag_E
+ movq ARG13, %r13 // D
+ movq ARG14, %r14 // sdd
+	sall $5, %r14d // 4*sdd*sizeof(double)
+ movq ARG19, %r15 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG18, %r12 // km
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
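+// note: in memory order .LC02 holds { 0.5 1.5 2.5 3.5 } and .LC03 holds
+// { 4.5 5.5 6.5 7.5 }; the store routines compare these per-lane row indices (by
+// subtraction) against the row bounds m0/m1 or km, so that the sign bit of each lane
+// selects which rows vmaskmovpd actually writes
+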
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemv_8_lib4.S b/kernel/avx2/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..1c9185a
--- /dev/null
+++ b/kernel/avx2/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1543 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
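+//
+// in short: this routine accumulates z(0:7) += A(0:7,0:k-1) * x(0:k-1) for a panel-major A,
+// with A1 = A0 + 4*sda doubles holding rows 4..7; each main-loop iteration processes 4
+// columns into the four accumulator pairs (ymm0/1, ymm2/3, ymm4/5, ymm6/7), which are
+// reduced afterwards by one of the blend routines below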
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+ vmovapd 64(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ addq $32, %r11
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
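+	// tail mask: ymm14 <- { 0.5, 1.5, 2.5, 3.5 } - k_left, so the sign bit is set exactly
+	// for the first k_left lanes and vmaskmovpd loads only the remaining elements of x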
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ // first 4 columns
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+
+ // last 4 columns
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+ vmovapd 64(%r15), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vbroadcastsd 24(%r13), %ymm12
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+ vmovapd 96(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+ // reduction
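+	// (the vhaddpd/vperm2f128 sequence below transposes and sums the four partial sums of
+	// each of the 8 dot products, leaving ymm0 = [z0 z1 z2 z3] and ymm1 = [z4 z5 z6 z7])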
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+ vaddpd %ymm4, %ymm6, %ymm4
+ vaddpd %ymm5, %ymm7, %ymm5
+ vaddpd %ymm0, %ymm4, %ymm0
+ vaddpd %ymm1, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+ vaddpd %ymm4, %ymm6, %ymm4
+ vaddpd %ymm5, %ymm7, %ymm5
+ vaddpd %ymm0, %ymm4, %ymm0
+ vaddpd %ymm1, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
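+//
+// computes z(0:7) = alpha * A(0:7,0:k-1) * x(0:k-1) + beta * y(0:7), with A stored in the
+// panel-major lib4 format (4-row panels, the next panel 4*sda doubles ahead);
+// illustrative call with placeholder names (not taken from this file):
+//   double alpha=1.0, beta=1.0;
+//   kernel_dgemv_n_8_lib4(k, &alpha, pA, sda, x, &beta, y, z);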
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_8_lib4
+ .type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_8_lib4
+ .def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
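+//
+// computes z(0:7) = alpha * A(0:k-1,0:7)' * x(0:k-1) + beta * y(0:7), i.e. the dot products
+// of 8 consecutive panel-major columns of A with x, scaled by alpha and added to beta*y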
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_8_lib4
+ .type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_8_lib4
+ .def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
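+//
+// computes z(0:7) = A(0:7,0:k-1) * x(0:k-1) for an upper triangular, non-transposed A in
+// panel-major format: the first 8 columns go through the triangular edge routine, any
+// remaining columns through the generic dgemv_n inner kernel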
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_8_lib4
+ .type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_8_lib4
+ .def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+#if MACRO_LEVEL>=1
+ INNER_BLENDER_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blender_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blender_n_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
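+// note: each 32-byte double constant below is written as four little-endian (low, high)
+// pairs of 32-bit words, e.g. a high word of 1071644672 = 0x3FE00000 encodes 0.5 and
+// 1072693248 = 0x3FF00000 encodes 1.0; LC00/LC01 are 64-bit integer constants (.quad)
+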
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..b1329fe
--- /dev/null
+++ b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1435 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
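+// LU factorization with partial (row) pivoting of the first 4 columns of an m x 4 panel-major
+// block: for each column the pivot is searched over the remaining rows with a vectorized
+// argmax, rows are swapped with drowsw_lib, the reciprocal of the pivot is stored in
+// inv_diag_A and used to scale the sub-diagonal entries, and the trailing columns are updated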
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
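+	// sgn: sign-bit mask used for abs via andnot; vna: index increment of 4 per row panel;
+	// lft/idx: lane (row) indexes offset by 0.2 so that the tail compare lft > (rows left)
+	// masks out exactly the out-of-range lanes, while _mm_cvtsd_si32 still rounds the
+	// winning index back to the integer row number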
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
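+	// horizontal argmax reduction: compare the two 128-bit halves, then the two lanes of the
+	// surviving half; the strict > comparison keeps the lower row index on ties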
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+
+
+ // third column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+
+
+ // fourth column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
+
+
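+// variable-size variant of the kernel above: it factorizes min(n,4) columns, returning early
+// once they are done, and masks the row updates so that only the first m rows are touched;
+// ipiv, inv_diag_A and the panel-major layout are as in kernel_dgetrf_pivot_4_lib4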
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>1)
+ {
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+ }
+
+ if(n==2)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // third column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>2)
+ {
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n==3)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // fourth column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>3)
+ {
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
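+
+
+// For reference only: a minimal scalar sketch (not part of BLASFEO) of the factorization
+// computed by the intrinsics above, i.e. a right-looking LU with partial pivoting of an
+// m x n panel with n<=4. The sketch uses plain column-major storage with a hypothetical
+// leading dimension lda, whereas the kernel above works on the 4-row panel-major format
+// and defers the scaling of each column to the following iteration; the result is the same.
+#if 0
+#include <math.h>
+static void ref_getrf_pivot_panel(int m, int n, double *a, int lda, double *inv_diag, int *ipiv)
+	{
+	int ii, jj, kk, ip;
+	double amax, tmp;
+	for(jj=0; jj<n; jj++)
+		{
+		// find the pivot: largest absolute value on or below the diagonal of column jj
+		ip = jj;
+		amax = fabs(a[jj+lda*jj]);
+		for(ii=jj+1; ii<m; ii++)
+			{
+			if(fabs(a[ii+lda*jj])>amax)
+				{
+				amax = fabs(a[ii+lda*jj]);
+				ip = ii;
+				}
+			}
+		ipiv[jj] = ip;
+		// swap rows jj and ip over the n columns of the panel
+		if(ip!=jj)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				tmp = a[jj+lda*kk];
+				a[jj+lda*kk] = a[ip+lda*kk];
+				a[ip+lda*kk] = tmp;
+				}
+			}
+		// scale the sub-diagonal part of column jj and store the inverse of the pivot
+		if(a[jj+lda*jj]!=0.0)
+			{
+			inv_diag[jj] = 1.0/a[jj+lda*jj];
+			for(ii=jj+1; ii<m; ii++)
+				a[ii+lda*jj] *= inv_diag[jj];
+			}
+		else
+			{
+			inv_diag[jj] = 0.0;
+			}
+		// rank-1 update of the trailing columns
+		for(kk=jj+1; kk<n; kk++)
+			{
+			for(ii=jj+1; ii<m; ii++)
+				a[ii+lda*kk] -= a[ii+lda*jj]*a[jj+lda*kk];
+			}
+		}
+	}
+#endif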
+
+
diff --git a/kernel/avx2/kernel_dsymv_6_lib4.S b/kernel/avx2/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..7a4411c
--- /dev/null
+++ b/kernel/avx2/kernel_dsymv_6_lib4.S
@@ -0,0 +1,996 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm6, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm7, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm8, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm9, %ymm13
+
+ vmovapd 128(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm14, %ymm10, %ymm13
+
+ vmovapd 160(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm14, %ymm11, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm15
+
+ vmaskmovpd 0(%r13), %ymm15, %ymm12
+ vmaskmovpd 0(%r14), %ymm15, %ymm13
+
+ vmaskmovpd 0(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm6, %ymm13
+
+ vmaskmovpd 32(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm7, %ymm13
+
+ vmaskmovpd 64(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm8, %ymm13
+
+ vmaskmovpd 96(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm9, %ymm13
+
+ vmaskmovpd 128(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm14, %ymm10, %ymm13
+
+ vmaskmovpd 160(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm14, %ymm11, %ymm13
+
+ vmaskmovpd %ymm13, %ymm15, 0(%r14)
+
+ sall $3, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
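+
+
+// For reference only (not part of the original source): in scalar terms, each 4-row block
+// handled by inner_kernel_dgemv_add_nt_6_lib4 above performs, for j = 0..5,
+//   z_t[j]   += A(0:3,j) . x_t(0:3)   (per-lane products accumulated in ymm0..ymm5)
+//   z_n(0:3) += A(0:3,j) * x_n[j]     (x_n values pre-scaled by alpha_n in the caller)
+// before advancing A to the next 4-row panel and x_t, z_n by 4 elements.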
+
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
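+
+
+// For reference only (not part of the original source): the disabled (#if 0) edge routine
+// above appears intended for the 4x4 diagonal block of a symmetric matrix stored as its
+// lower triangle: the vblendpd masks zero the entries above the diagonal for the transposed
+// accumulation, and additionally the diagonal entry for the x_n contribution, so that no
+// stored entry is counted twice.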
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm4 <- dirty
+// ymm5 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm5, %ymm4, %ymm4
+// vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vextractf128 $0x1, %ymm4, %xmm5
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm4
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm4, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmovupd 32(%r12), %ymm13
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vfmadd231pd %ymm15, %ymm13, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
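+
+
+// For reference only (not part of the original source): in scalar terms the blend/scale
+// routine above reduces the six per-lane accumulators and applies the scaling
+//   z_t[j] = alpha_t * sum_i ymm_j[i] + beta_t * y_t[j],   j = 0..5
+// leaving z_t[0..3] in ymm0 and z_t[4..5] in the lower half of ymm1 for inner_store_6_lib4.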
+
+
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %xmm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_6_lib4
+ .type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_6_lib4
+ .def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+ vbroadcastsd 32(%r10), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vbroadcastsd 40(%r10), %ymm11
+ vmulpd %ymm15, %ymm11, %ymm11
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_6_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
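+
+
+// For reference only (not part of the original source): with the prototype documented above,
+// a call such as
+//   kernel_dgemv_nt_6_lib4(k, &alpha_n, &alpha_t, pA, sda, x_n, x_t, &beta_t, y_t, z_n, z_t);
+// sweeps once over the k x 6 panel-major block pA (panel stride sda) and computes both
+//   z_n(0:k-1) += alpha_n * pA   * x_n(0:5)                        (updated in place)
+//   z_t(0:5)    = alpha_t * pA^T * x_t(0:k-1) + beta_t * y_t(0:5)
+// the variable names in this call are illustrative only.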
+
+
+
+
+
+#if 0
+// TODO
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner edge dsyrk & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
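+	// (the .long pairs above encode the doubles 0.5, 1.5, 2.5, 3.5; the clean-up code
+	// subtracts the remaining count from them and uses the sign bits of the result as
+	// the vmaskmovpd lane mask)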
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx2/kernel_sgemm_16x4_lib8.S b/kernel/avx2/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..857fb11
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,6811 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmovaps 32(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ vmovaps 32(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 1
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ // unroll 2
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r13
+ vmovaps 96(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r11
+ vmovaps 96(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r15
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 3
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 0(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 0(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmovaps 32(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ vmovaps 32(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 1
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ // unroll 2
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r13
+ vmovaps 96(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r11
+ vmovaps 96(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r15
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 3
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+// vmovaps 0(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+// vmovaps 0(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
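+
+// Rough reference of the operation in plain C (illustrative names only, assuming the
+// lib8 panel-major layout: 8-row panels, ymm0-3 = rows 0-7 and ymm4-7 = rows 8-15 of
+// columns 0-3); the optimized loop below is the authoritative implementation:
+//
+//     for(ll=0; ll<k; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<16; ii++)
+//                 acc[jj][ii] -= A[ii/8*8*sda + ii%8 + 8*ll] * B[jj + 8*ll]; // nt: B transposed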
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+ // unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+ // unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
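+
+// Rough reference of the operation in plain C (illustrative names only): same 16x4
+// accumulation as the nt kernels, but B is traversed non-transposed with panel stride
+// sdb; the loop below consumes 8 k-iterations per trip and prefetches the next B panel:
+//
+//     for(ll=0; ll<k; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<16; ii++)
+//                 acc[jj][ii] += A[ii/8*8*sda + ii%8 + 8*ll] * B[ll/8*8*sdb + ll%8 + 8*jj];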
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $8, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 4
+ vbroadcastss 16(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 160(%r11), %ymm10 // A
+ vbroadcastss 48(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 160(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 80(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 112(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 6
+ vbroadcastss 24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 224(%r11), %ymm10 // A
+ vbroadcastss 56(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 224(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 88(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 120(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq %r14, %r13
+
+ cmpl $8, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $7, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 4
+ vbroadcastss 16(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 160(%r11), %ymm10 // A
+ vbroadcastss 48(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 160(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 80(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 112(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 6
+ vbroadcastss 24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 224(%r11), %ymm10 // A
+ vbroadcastss 56(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 224(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 88(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 120(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq %r14, %r13
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
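+
+// Rough reference in plain C (illustrative names only): when B does not start at the
+// top of an 8-row panel (offB>0), this edge performs kend = min(k, 8-offB) single
+// k-iterations and then realigns B to the start of the next panel:
+//
+//     kend = k < 8-offB ? k : 8-offB;
+//     for(ll=0; ll<kend; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<16; ii++)
+//                 acc[jj][ii] += A[ii/8*8*sda + ii%8 + 8*ll] * B[offB+ll + 8*jj];
+//     // if k > kend, B is then advanced to the start of the next 8-row panel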
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
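+
+// Rough reference in plain C (illustrative names only): B is lower triangular, so in
+// the first three k-iterations column jj of the 4-wide block is updated only for
+// jj <= ll; the offB==5/6/7 branches below only handle the B panel boundary:
+//
+//     for(ll=0; ll<3 && ll<k; ll++)
+//         for(jj=0; jj<=ll; jj++)
+//             for(ii=0; ii<16; ii++)
+//                 acc[jj][ii] += A[ii/8*8*sda + ii%8 + 8*ll] * Bval(offB+ll, jj);
+//     // Bval(row, col) stands for the panel-major access to B, wrapping to the next
+//     // 8-row panel when the row index reaches 8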
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 64(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+ // offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 68(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
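+
+// Rough reference in plain C (illustrative names only; x[jj] is column jj of the 16x4
+// block, D(i,j) the lower factor stored panel-major at r10): a right solve against the
+// transposed lower factor using the precomputed reciprocal diagonal; the code below
+// additionally skips updates of columns that are never stored (index >= kn):
+//
+//     for(jj=0; jj<kn && jj<4; jj++)
+//         {
+//         x[jj] *= inv_diag_D[jj];
+//         for(ii=jj+1; ii<4; ii++)
+//             x[ii] -= D(ii, jj) * x[jj];
+//         }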
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
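+
+// Rough reference in plain C (illustrative names only): factorize the 4x4 diagonal
+// block at the top of the 16x4 panel and apply it to the rows below; a non-positive
+// pivot is replaced by 0.0 (the jbe branches), and 1/sqrt(pivot) goes to inv_diag_E:
+//
+//     for(jj=0; jj<kn && jj<4; jj++)
+//         {
+//         tmp = acc[jj][jj];
+//         tmp = tmp > 0.0f ? 1.0f/sqrtf(tmp) : 0.0f;
+//         inv_diag_E[jj] = tmp;
+//         for(ii=jj; ii<16; ii++)
+//             acc[jj][ii] *= tmp;                           // scale pivot column
+//         for(kk=jj+1; kk<4; kk++)
+//             for(ii=kk; ii<16; ii++)
+//                 acc[kk][ii] -= acc[jj][ii] * acc[jj][kk]; // trailing update
+//         }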
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
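+
+// Rough reference in plain C (illustrative names only): alpha/beta scaling of the 16x4
+// accumulator against C; the whole C pass is skipped when beta compares equal to 0.0:
+//
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<16; ii++)
+//             {
+//             acc[jj][ii] *= alpha;
+//             if(beta!=0.0f)
+//                 acc[jj][ii] += beta * C[ii/8*8*sdc + ii%8 + 8*jj];
+//             }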
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+
+ vmovaps 0(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 32(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 64(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 96(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // C1 <- C0
+ addq %r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
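+
+// Rough reference in plain C (illustrative names only): rows 0-7 are always full, so
+// only the second 8-row panel of D is masked; the ymm15 mask is built by comparing km
+// against the per-lane row index constants loaded from .LC01, and kn limits the columns:
+//
+//     for(jj=0; jj<kn && jj<4; jj++)
+//         {
+//         for(ii=0; ii<8; ii++)            D[ii + 8*jj]           = acc[jj][ii];
+//         for(ii=8; ii<km && ii<16; ii++)  D[8*sdd + ii-8 + 8*jj] = acc[jj][ii];
+//         }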
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 7f // end
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 7f // end
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 7f // end
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+ jmp 0f
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
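+
+// Rough reference in plain C (illustrative names only, offset==0 path; the offset>0
+// cases below are still TODO stubs): store the row range [m0,m1) and column range
+// [n0,n1) of the block, after shifting away the first n0 accumulator columns:
+//
+//     for(jj=n0; jj<n1 && jj<4; jj++)
+//         for(ii=m0; ii<m1 && ii<16; ii++)
+//             D[ii/8*8*sdd + ii%8 + 8*jj] = acc[jj][ii];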
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%rbx)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%rbx)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%rbx)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%rbx)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbp // D1
+ addq %r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
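+
+// Rough reference in plain C (illustrative names only): lower-triangular store; the
+// vblendps masks 0x01/0x03/0x07 keep the strictly-upper entries of the existing 4x4
+// diagonal block of D, everything on and below the diagonal is overwritten:
+//
+//     for(jj=0; jj<4; jj++)
+//         {
+//         for(ii=jj; ii<8; ii++)  D[ii + 8*jj]           = acc[jj][ii];
+//         for(ii=8; ii<16; ii++)  D[8*sdd + ii-8 + 8*jj] = acc[jj][ii];
+//         }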
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
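+	// note: ymm14 = broadcast(m0) - LC00 masks the upper panel (rows with index >= m0),
+	// ymm15 = LC01 - broadcast(m1) masks the lower panel (rows with index < m1),
+	// assuming LC00/LC01 hold the row indices 0.5..7.5 and 8.5..15.5 of the two panels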
+
+ // shift D and sol for cols
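+	// (the first n0 columns of the 4-column block are dropped by shifting the
+	// accumulator pairs down and advancing D by 32 bytes per skipped column)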
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
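+// note: computes the 16x4 block D = alpha * A * B^T + beta * C, with A, C and D stored
+// in the lib8 panel-major format (8-row panels, panel strides sda, sdc, sdd)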
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .def kernel_sgemm_nt_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
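+// note: same operation as kernel_sgemm_nt_16x4_lib8, but the store is clipped to km rows and kn columns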
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
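+// note: generalized variant: C and D may start at a row offset (offsetC, offsetD) inside their
+// first 8-row panel, and the store is restricted to rows m0..m1-1 and columns n0..n1-1 of the
+// block; the non-zero offsetD paths of inner_store_16x4_gen_lib8 above are still marked TODO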
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
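+// note: nn variant: B is read non-transposed; offsetB is the row offset of the first
+// element of B inside its 8-row panel and is handled by the inner edge routine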
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
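+// note: computes D such that D * E^T = C - A * B^T, with E lower triangular (rl = right, lower)
+// and inv_diag_E holding the reciprocals of its diagonal entries (inv)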
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
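+// note: performs one 4-column step of a blocked Cholesky factorization on the 12x4 block
+// C - A * B^T: the 4x4 diagonal block is factorized and the rows below it are solved against it;
+// D gets the lower part (via the store_l routine) and inv_diag_D the reciprocals of the 4 diagonal entries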
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
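+// note: fuses the syrk update and the factorization: Ap * Bp^T is added and Am * Bm^T
+// subtracted from C before the same 4-column factorization and store_l as in
+// kernel_spotrf_nt_l_12x4_lib8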
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
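+// note: like kernel_sgemm_nt_16x4_lib8, but only the lower part of D is written: the
+// store_l routine blends the strictly upper entries of the 4x4 diagonal block back from D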
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .def kernel_ssyrk_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1      2             3         4        5            6             7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .def kernel_ssyrk_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1      2             3         4        5            6             7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
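+// (sketch: following the usual BLASFEO naming, "nn_rl" means D = alpha * A * B
+// with the triangular matrix B on the right and lower triangular; the
+// "initial triangle" edge calls below consume the leading triangular part of
+// B, after which the remaining k iterations reduce to a plain gemm nn)
+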
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
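+// (sketch: the _gen variant stores the result into the sub-block of D selected
+// by [m0,m1) x [n0,n1), with offsetD giving the row offset of D inside its
+// 8-row panel, as suggested by the argument names and the generalized store
+// call below)
+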
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
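+// (note: the .long values above are IEEE-754 single-precision bit patterns,
+// e.g. 1056964608 = 0x3F000000 = 0.5f, 1069547520 = 0x3FC00000 = 1.5f,
+// 3212836864 = 0xBF800000 = -1.0f; LC00/LC01/LC02 thus hold the lane indices
+// {0.5,...,7.5}, {8.5,...,15.5}, {16.5,...,23.5}, presumably compared against
+// row bounds to build masks for the variable-size and generalized stores,
+// while LC03 is a +/-1 vector with -1.0 in the two highest lanes)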
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_24x4_lib8.S b/kernel/avx2/kernel_sgemm_24x4_lib8.S
new file mode 100644
index 0000000..b3a027f
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_24x4_lib8.S
@@ -0,0 +1,7734 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
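+
+// (note on the two ABIs handled above: on System V (OS_LINUX/OS_MAC) the first
+// six integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the
+// 7th onwards sit on the stack just above the return address, so after the
+// prologue's "subq $STACKSIZE, %rsp" the 7th argument is at STACKSIZE+8(%rsp);
+// on Win64 only rcx, rdx, r8, r9 carry arguments and the caller reserves a
+// 32-byte shadow area, so the 5th argument is at 40(%rsp) on entry, i.e.
+// STACKSIZE+40(%rsp) after the prologue, and xmm6-xmm15 are callee-saved,
+// which is why the Windows prologue/epilogue also spill and restore them)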
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
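+// (sketch of the broadcast scheme actually used below: the three 8x4 panels of
+// A at r11, r11+r12 and r11+2*r12 are kept in ymm13/ymm14/ymm15, one element
+// of B at a time is broadcast into ymm12, and the 24x4 result is accumulated
+// as ymm0-ymm3 = columns 0-3 of rows 0-7, ymm4-ymm7 = rows 8-15,
+// ymm8-ymm11 = rows 16-23; per k iteration this is equivalent to
+//   for(jj=0; jj<4; jj++) { b = B(jj,kk); acc0[jj] += A0*b; acc1[jj] += A1*b; acc2[jj] += A2*b; })
+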
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_24x4_lib8, @function
+inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_24x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+ vmovaps 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+ movq %r15, %rax // A2 <- A1
+ addq %r12, %rax // A2 <- A1 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm13 // A0
+ vmovaps 0(%r15), %ymm14 // A1
+ vmovaps 0(%rax), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 32(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 32(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%rax), %ymm15 // A2
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 64(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 64(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vmovaps 64(%rax), %ymm15 // A2
+
+
+ // unroll 2
+ subl $4, %r10d
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 96(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 96(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 96(%r13), %ymm12 // B
+ vmovaps 96(%rax), %ymm15 // A2
+
+
+ // unroll 3
+ addq $128, %r13
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %r11
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ addq $128, %r15
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %rax
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 0(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 0(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%rax), %ymm15 // A2
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 32(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 32(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%rax), %ymm15 // A2
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 64(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 64(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vmovaps 64(%rax), %ymm15 // A2
+
+
+ // unroll 2
+ subl $4, %r10d
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 96(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 96(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 96(%r13), %ymm12 // B
+ vmovaps 96(%rax), %ymm15 // A2
+
+
+ // unroll 3
+ addq $128, %r13
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %r11
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ addq $128, %r15
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %rax
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+// vmovaps 0(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+// vmovaps 0(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+// vbroadcastf128 0(%r13), %ymm12 // B
+// vmovaps 0(%rax), %ymm15 // A2
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%r11), %ymm13 // A0
+ vmovaps 32(%r15), %ymm14 // A1
+ vmovaps 32(%rax), %ymm15 // A2
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $1, %r10d
+
+ vpermilps $0xb1, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vpermilps $0x4e, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r13
+
+ vpermilps $0xb1, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r15
+
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_24x4_lib8, .-inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
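+// (note: this is the mirror image of the _add_ routine above, using
+// vfnmadd231ps instead of vfmadd231ps so that A*B^T is subtracted from the
+// accumulators; presumably the building block for the "minus" update term of
+// the fused syrk/potrf kernels)
+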
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_24x4_lib8, @function
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+ vmovaps 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+ // unroll 0
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_24x4_lib8, .-inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- []
+// ymm9 <- []
+// ymm10 <- []
+// ymm11 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
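+// (sketch of the B addressing used below: within its current 8-row panel,
+// element B(kk,jj) sits at byte offset 4*kk + 32*jj from r13, hence the
+// broadcasts at 0/32/64/96 for the first k iteration, 4/36/68/100 for the
+// second, and so on; r13 advances by one full panel (r14 bytes) every 8
+// iterations of the main loop and by 4 bytes per iteration in the clean-up
+// loop, while the prefetcht0 pair at the top of the main loop pulls in the
+// next panel of B ahead of time)
+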
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_24x4_lib8, @function
+inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A0
+ vmovaps 32(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 32(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 4(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 36(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 68(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 100(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A0
+ vmovaps 64(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 64(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 8(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 40(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 72(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 104(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A0
+ vmovaps 96(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 96(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 12(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 44(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 76(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 108(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A0
+ vmovaps 128(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 128(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 16(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 48(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 80(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 112(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A0
+ vmovaps 160(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 160(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 20(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 52(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 84(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 116(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A0
+ vmovaps 192(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 192(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 24(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 56(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ subl $8, %r10d
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 88(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+	vfmadd231ps	%ymm14, %ymm15, %ymm10
+	vbroadcastss	120(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A0
+ vmovaps 224(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 224(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 28(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 60(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ addq $256, %r11
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 92(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 124(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ addq %r14, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_24x4_lib8, .-inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
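+// (note: this edge consumes the first min(k, 8-offsetB) iterations one at a
+// time so that the rest of B starts on an 8-row panel boundary; e.g. with
+// offsetB=3 and k large enough, 5 iterations are handled here and B is then
+// advanced to the beginning of the next panel via the addq %r14 / subq $32
+// pair below)
+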
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_24x4_lib8, @function
+inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_24x4_lib8, .-inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B-offB+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_24x4_lib8, @function
+inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
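+	// the lower-triangular 4x4 block of B contributes 3 partial iterations
+	// (using columns 0, 0-1 and 0-2 of B); the branches below only differ in
+	// where the 8-row panel boundary of B falls: for offB<=4 all 3 steps stay
+	// in the current panel, for offB==5/6/7 B must advance to the next panel
+	// after 3/2/1 steps respectively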
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 64(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 68(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_24x4_lib8, .-inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_24x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#endif
+#endif
+
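+	// right trsm with a lower-triangular transposed factor: column j of the
+	// 24x4 block is scaled by inv_diag_D[j] (r11), then the columns to its
+	// right are updated with the sub-diagonal entries of the factor read from
+	// r10; kn (r12d) allows an early exit when fewer than 4 columns are needed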
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_24x4_vs_lib8, .-inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_24x4_vs_lib8, @function
+inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_24x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
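+	// for each of the 4 pivot columns: test the diagonal entry d_jj, compute
+	// 1.0/sqrt(d_jj) (or 0.0 if d_jj<=0.0, via the branches at 1:/3:/5:/7:),
+	// store it to inv_diag_E, scale the 24x1 column by it and rank-1 update
+	// the columns to its right; kn (r11d) allows an early exit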
+ vmovss %xmm0, %xmm0, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm15
+ vpermilps $0x55, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm15
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm15
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_24x4_vs_lib8, .-inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_20x4_vs_lib8, @function
+inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_20x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
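+	// same factorization as the 24x4 edge above, but here the diagonal of
+	// column j sits at row 4+j of the first 8-row panel, hence the
+	// vextractf128 $0x1 / vperm2f128 $0x11 picking d_jj from the upper
+	// 128-bit lane of ymm0..ymm3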
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm15
+ vpermilps $0x55, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm15
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm15
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_20x4_vs_lib8, .-inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_24x4_lib8, @function
+inner_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_lib8:
+#endif
+#endif
+
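+	// D = alpha*A*B + beta*C: scale the 24x4 accumulator by alpha, then,
+	// unless beta==0.0 (in which case C is not read at all), add beta times
+	// the three 8x4 panels of C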
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_24x4_lib8, .-inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_24x4_gen_lib8, @function
+inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+	movq	%r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
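+	// note: only offset==0 is implemented; the offset>0 branches below are
+	// placeholders (TODO) and currently fall through to the end without adding
+	// the beta*C term (the other _gen routines in this file have the same
+	// placeholder structure)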
+
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_24x4_gen_lib8, .-inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_24x4_lib8, @function
+inner_scale_a0_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_24x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm12
+
+ vmulps %ymm0, %ymm12, %ymm0
+ vmulps %ymm1, %ymm12, %ymm1
+ vmulps %ymm2, %ymm12, %ymm2
+ vmulps %ymm3, %ymm12, %ymm3
+
+ vmulps %ymm4, %ymm12, %ymm4
+ vmulps %ymm5, %ymm12, %ymm5
+ vmulps %ymm6, %ymm12, %ymm6
+ vmulps %ymm7, %ymm12, %ymm7
+
+ vmulps %ymm8, %ymm12, %ymm8
+ vmulps %ymm9, %ymm12, %ymm9
+ vmulps %ymm10, %ymm12, %ymm10
+ vmulps %ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_24x4_lib8, .-inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_24x4_lib8, @function
+inner_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_lib8:
+#endif
+#endif
+
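+	// alpha=1.0 and beta=1.0 are implied: ymm14 is loaded with a vector of
+	// ones (constant .LC03, assumed to hold 1.0 in every lane), so each fma
+	// below simply adds the corresponding panel of C to the accumulator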
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_24x4_lib8, .-inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_24x4_gen_lib8, @function
+inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_24x4_gen_lib8, .-inner_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_24x4_lib8, @function
+inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_lib8:
+#endif
+#endif
+
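+	// the nt main loop leaves the 4 columns of each 8x4 panel rotated across
+	// its accumulators on a per-element basis (see the register layout
+	// comments of the _gen variant below); the two rounds of vblendps
+	// (alternating elements, then element pairs) restore plain column order
+	// before scaling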
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r13, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_24x4_lib8, .-inner_blend_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_24x4_gen_lib8, @function
+inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+	movq	%r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_24x4_gen_lib8, .-inner_blend_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_24x4_lib8, @function
+inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r11, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vaddps %ymm15, %ymm8, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vaddps %ymm15, %ymm9, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vaddps %ymm15, %ymm10, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vaddps %ymm15, %ymm11, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_24x4_lib8, .-inner_blend_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_24x4_gen_lib8, @function
+inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+	movq	%r11, %r15 // C1 <- C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r12, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vaddps %ymm15, %ymm8, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vaddps %ymm15, %ymm9, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vaddps %ymm15, %ymm10, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vaddps %ymm15, %ymm11, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%rax, %rbx // C2
+	addq	%r12, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_24x4_gen_lib8, .-inner_blend_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_lib8, @function
+inner_store_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_lib8:
+#endif
+#endif
+
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 8*sdd*sizeof(float)
+	movq	%r15, %rax // D2 <- D1
+	addq	%r11, %rax // D2 <- D1 + 8*sdd*sizeof(float)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+ vmovaps %ymm8, 0(%rax)
+ vmovaps %ymm9, 32(%rax)
+ vmovaps %ymm10, 64(%rax)
+ vmovaps %ymm11, 96(%rax)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_lib8, .-inner_store_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_vs_lib8, @function
+inner_store_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_vs_lib8:
+#endif
+#endif
+
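+	// build the row mask for the third 8-row panel: broadcast km and subtract
+	// it from the per-lane row indices assumed to be stored at .LC02; lanes
+	// with a negative result (row < km) have the sign bit set, which
+	// vmaskmovps uses as the store mask, so only rows 16..km-1 of the last
+	// panel are written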
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_vs_lib8, .-inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_gen_lib8, @function
+inner_store_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_gen_lib8:
+#endif
+#endif
+
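+	// two row masks are built from the index constants assumed at .LC00/.LC02:
+	// ymm14 masks out the rows below m0 in the first 8-row panel, ymm15 masks
+	// out the rows at or above m1 in the third panel; the middle panel is
+	// stored unmasked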
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute D1
+	movq	%r11, %rbx // D1
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+	movq	%rbx, %rbp // D2
+	addq	%r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%rbx)
+ vmaskmovps %ymm8, %ymm15, 0(%rbp)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%rbx)
+ vmaskmovps %ymm9, %ymm15, 32(%rbp)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%rbx)
+ vmaskmovps %ymm10, %ymm15, 64(%rbp)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%rbx)
+ vmaskmovps %ymm11, %ymm15, 96(%rbp)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+// movq %r11, %rbp // D1
+// addq %r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+	addq	%rbp, %r12 // D3 <- D2 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_gen_lib8, .-inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_lib8, @function
+inner_store_l_20x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_lib8:
+#endif
+#endif
+
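+	// lower-triangular store for the 20x4 case: the diagonal of column j lies
+	// at row 4+j of the first 8-row panel, so the vblendps masks keep the
+	// first 4+j elements of the existing D and overwrite only the rows on and
+	// below the diagonal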
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+ vmovaps %ymm8, 0(%r10, %r11, 2)
+ vmovaps %ymm9, 32(%r10, %r11, 2)
+ vmovaps %ymm10, 64(%r10, %r11, 2)
+ vmovaps %ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_lib8, .-inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_lib8, @function
+inner_store_l_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_lib8:
+#endif
+#endif
+
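+	// lower-triangular store: column j of the first 8-row panel keeps its
+	// first j elements from the existing D (vblendps masks 0x01/0x03/0x07),
+	// so the strictly upper part of the 4x4 diagonal block is left untouched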
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+ vmovaps %ymm8, 0(%r10, %r11, 2)
+ vmovaps %ymm9, 32(%r10, %r11, 2)
+ vmovaps %ymm10, 64(%r10, %r11, 2)
+ vmovaps %ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_lib8, .-inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_vs_lib8, @function
+inner_store_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_vs_lib8:
+#endif
+#endif
+
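+	// The row mask built below only affects the third 8-row panel (rows 16..23):
+	// km is converted to float, broadcast to all lanes and subtracted from the
+	// ascending constants in LC02 (presumably { 16.5, ..., 23.5 }), so lane i is
+	// negative exactly when row 16+i < km, and vmaskmovps stores only those lanes.
+	// Scalar sketch of the predicate, under that assumption on LC02:
+	//   for(ii=0; ii<8; ii++) mask[ii] = (16.5f + ii - (float) km < 0.0f) ? ~0 : 0;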
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_vs_lib8, .-inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_vs_lib8, @function
+inner_store_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_vs_lib8, .-inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_gen_lib8, @function
+inner_store_l_20x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_gen_lib8:
+#endif
+#endif
+
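+	// ymm14 = m0 - LC00 (presumably { 0.5, ..., 7.5 }) is negative for lanes with
+	// row index >= m0, so the first 8-row panel skips its leading m0 rows;
+	// ymm15 = LC02 - m1 (presumably { 16.5, ..., 23.5 }) is negative for rows
+	// 16+i < m1, clipping the third panel; the middle panel is always stored in full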
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+	// shift D pointer and accumulation registers by the leading column offset n0
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%r11, %r12, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%r11, %r12, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%r11, %r12, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r11, %r12, 2)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%r11, %r12, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r11, %r12, 2)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_gen_lib8, .-inner_store_l_20x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_gen_lib8, @function
+inner_store_l_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+	// shift D pointer and accumulation registers by the leading column offset n0
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%r11, %r12, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%r11, %r12, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%r11, %r12, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r11, %r12, 2)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%r11, %r12, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r11, %r12, 2)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_gen_lib8, .-inner_store_l_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
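+// Illustrative call from C (hypothetical values): computes one 24x4 block of
+// D = alpha*A*B^T + beta*C on panel-major data with 8-row panels, where A, C and
+// D each span three consecutive panels with panel strides sda, sdc and sdd:
+//
+//   float alpha = 1.0, beta = 0.0;
+//   kernel_sgemm_nt_24x4_lib8(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);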
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_lib8
+ .type kernel_sgemm_nt_24x4_lib8, @function
+kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_lib8
+_kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_lib8
+ .def kernel_sgemm_nt_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_lib8, .-kernel_sgemm_nt_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_vs_lib8
+ .type kernel_sgemm_nt_24x4_vs_lib8, @function
+kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_vs_lib8
+_kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_vs_lib8
+ .def kernel_sgemm_nt_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_vs_lib8, .-kernel_sgemm_nt_24x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
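+// Compared to the plain nt kernel above, the _gen variant also takes sub-panel
+// offsets for C and D (offsetC, offsetD) and the index ranges [m0,m1) x [n0,n1)
+// actually written, so partially covered 24x4 blocks can be handled; note that
+// the offsetD > 0 paths of inner_store_24x4_gen_lib8 are still TODO stubs.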
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_gen_lib8
+ .type kernel_sgemm_nt_24x4_gen_lib8, @function
+kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_gen_lib8
+_kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_gen_lib8
+ .def kernel_sgemm_nt_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_gen_lib8, .-kernel_sgemm_nt_24x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+
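+// The nn variant multiplies by a non-transposed B, stored panel-major with panel
+// stride sdb; offsetB is the row offset of the first B element inside its 8-row
+// panel and is consumed by the inner edge routine before the main loop.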
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_lib8
+ .type kernel_sgemm_nn_24x4_lib8, @function
+kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_lib8
+_kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_lib8
+ .def kernel_sgemm_nn_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_lib8, .-kernel_sgemm_nn_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_vs_lib8
+ .type kernel_sgemm_nn_24x4_vs_lib8, @function
+kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_vs_lib8
+_kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_vs_lib8
+ .def kernel_sgemm_nn_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_vs_lib8, .-kernel_sgemm_nn_24x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_gen_lib8
+ .type kernel_sgemm_nn_24x4_gen_lib8, @function
+kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_gen_lib8
+_kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_gen_lib8
+ .def kernel_sgemm_nn_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_gen_lib8, .-kernel_sgemm_nn_24x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
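+// Fused kernel: accumulates D = C - A*B^T over k iterations, then solves the
+// right triangular system D <- D * E^{-T}, with E lower triangular and
+// inv_diag_E holding the precomputed reciprocals of its diagonal entries.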
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_24x4_lib8
+ .type kernel_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_24x4_lib8
+_kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_24x4_lib8
+ .def kernel_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_24x4_lib8, .-kernel_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
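+// Cholesky kernel for a 20x4 lower block: D = C - A*B^T is accumulated, the 4x4
+// diagonal block (rows 4..7 of the first 8-row panel, see inner_store_l_20x4_lib8)
+// is factorized, the rows below it are scaled accordingly, and the reciprocals of
+// the computed diagonal are returned in inv_diag_D.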
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_20x4_lib8
+ .type kernel_spotrf_nt_l_20x4_lib8, @function
+kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_20x4_lib8
+_kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_20x4_lib8
+ .def kernel_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_20x4_lib8, .-kernel_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_20x4_vs_lib8
+ .type kernel_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_20x4_vs_lib8
+_kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_20x4_vs_lib8
+ .def kernel_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_20x4_vs_lib8, .-kernel_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
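+// Same as the 20x4 kernel above, but for a full 24x4 block whose 4x4 diagonal
+// block sits at rows 0..3 of the first 8-row panel (see inner_store_l_24x4_lib8).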
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_24x4_lib8
+ .type kernel_spotrf_nt_l_24x4_lib8, @function
+kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_24x4_lib8
+_kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_24x4_lib8
+ .def kernel_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_24x4_lib8, .-kernel_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_24x4_vs_lib8
+ .type kernel_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_24x4_vs_lib8
+_kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_24x4_vs_lib8
+ .def kernel_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_24x4_vs_lib8, .-kernel_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
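+// Fused syrk + Cholesky kernel: adds Ap*Bp^T over kp iterations, subtracts
+// Am*Bm^T over km iterations, adds C, then factorizes the resulting 20x4 block
+// exactly as kernel_spotrf_nt_l_20x4_lib8 above, returning the diagonal
+// reciprocals in inv_diag_D.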
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_20x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_20x4_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_20x4_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_24x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_24x4_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_24x4_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
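+//
+// Rough reference (a sketch inferred from the inner calls below): the 24x4
+// panel D = beta*C + alpha*A*B^T is computed, and the "_l" store writes only
+// the lower-triangular part of the 4x4 diagonal block plus the 20 rows below it.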
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_24x4_lib8
+ .type kernel_ssyrk_nt_l_24x4_lib8, @function
+kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_24x4_lib8
+_kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_24x4_lib8
+ .def kernel_ssyrk_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_24x4_lib8, .-kernel_ssyrk_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_24x4_vs_lib8
+ .type kernel_ssyrk_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_24x4_vs_lib8
+_kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_24x4_vs_lib8
+ .def kernel_ssyrk_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_24x4_vs_lib8, .-kernel_ssyrk_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_20x4_lib8
+ .type kernel_ssyrk_nt_l_20x4_lib8, @function
+kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_20x4_lib8
+_kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_20x4_lib8
+ .def kernel_ssyrk_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_20x4_lib8, .-kernel_ssyrk_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_20x4_vs_lib8
+ .type kernel_ssyrk_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_20x4_vs_lib8
+_kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_20x4_vs_lib8
+ .def kernel_ssyrk_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_20x4_vs_lib8, .-kernel_ssyrk_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
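+//
+// Rough reference (a sketch inferred from the inner calls below): D = alpha*A*B
+// with B lower triangular ("right, lower, not transposed"); the trmm edge handles
+// the triangular top of B starting at row offsetB inside its 8-row panel, and the
+// plain nn gemm kernel accumulates the remaining rectangular part.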
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_24x4_lib8
+ .type kernel_strmm_nn_rl_24x4_lib8, @function
+kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_24x4_lib8
+_kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_24x4_lib8
+ .def kernel_strmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_24x4_lib8, .-kernel_strmm_nn_rl_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_24x4_vs_lib8
+ .type kernel_strmm_nn_rl_24x4_vs_lib8, @function
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_24x4_vs_lib8
+_kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_24x4_vs_lib8
+ .def kernel_strmm_nn_rl_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_24x4_vs_lib8, .-kernel_strmm_nn_rl_24x4_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x4_lib8.S b/kernel/avx2/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..44946f1
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,7342 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
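+
+// Note on the stack-argument macros above: PROLOGUE moves %rsp down by STACKSIZE
+// before spilling the callee-saved registers, so the caller's stack arguments end
+// up STACKSIZE bytes further from %rsp; the 7th integer argument, found at
+// 8(%rsp) on entry (just past the return address in the SysV AMD64 ABI), is then
+// at STACKSIZE+8(%rsp), which is what ARG7..ARG18 encode. The OS_WINDOWS block
+// below follows the same pattern, with the 32-byte shadow space and the different
+// register arguments of the Windows x64 calling convention accounting for the
+// STACKSIZE+40(%rsp) offset of ARG5.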
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
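+//
+// Reference sketch in C-like pseudo-code (assuming the lib8 panel-major layout,
+// i.e. A and B stored in 8-row panels of column-major data):
+//
+// for(l=0; l<k; l++)
+//     for(j=0; j<4; j++)
+//         for(i=0; i<8; i++)
+//             d[i][j] += A[i + l*8] * B[j + l*8];
+//
+// column j of d is kept in ymm(j); a second accumulator set ymm4..ymm7 is used on
+// alternate k-iterations to hide FMA latency and is summed into ymm0..ymm3 in the
+// final reduce step.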
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // A
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // a
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+// vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+// vbroadcastf128 128(%r12), %ymm14 // B
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+// vbroadcastf128 32(%r12), %ymm15 // B
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // A
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // a
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+// vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
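+//
+// Reference sketch in C-like pseudo-code (assuming the lib8 layout, with B stored
+// in 8-row panels and r13 carrying the byte stride 8*sdb*sizeof(float) between
+// consecutive panels of B):
+//
+// for(l=0; l<k; l++)
+//     for(j=0; j<4; j++)
+//         for(i=0; i<8; i++)
+//             d[i][j] += A[i + l*8] * B[(l%8) + j*8 + (l/8)*8*sdb];
+//
+// i.e. one float of B is broadcast per (l,j) pair, a full 8-float column of A is
+// loaded per l, and B jumps to the next panel every 8 k-iterations.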
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// prefetcht0 0(%r12, %r13, 1) // software prefetch
+// prefetcht0 64(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B+8*sdb*sizeof(float)
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <= dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 1) // software prefetch
+ prefetcht0 64(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B+8*sdb*sizeof(float)
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
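+//
+// Sketch of the intent (inferred from the code below): when B starts at a
+// non-zero row offset offB inside its 8-row panel, the first min(k, 8-offB)
+// k-iterations are processed here one at a time, after which B is advanced to
+// the beginning of the next panel so that the main nn kernel runs on aligned
+// panels.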
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %r15d
+ subl %r14d, %r15d // 8-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
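+//
+// Sketch of the intent (inferred from the code below): B is lower triangular,
+// so during the leading k-iterations only the columns j <= l receive a
+// contribution (B[l][j] == 0 for j > l); the branches below replay those partial
+// updates for each possible starting offset offB inside the 8-row panel of B.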
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r12, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r14d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r14d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movl $0, %r14d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r14d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r14d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
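+//
+// Reference sketch in C-like pseudo-code (assuming E is the 4x4 lower-triangular
+// factor stored in an 8-row panel and inv_diag_E holds 1.0/E[j][j]); the solve
+// X*E^T = B is applied to all 8 rows at once, one column per ymm register:
+//
+// x0 = b0*inv_d0;
+// x1 = (b1 - e10*x0)*inv_d1;
+// x2 = (b2 - e20*x0 - e21*x1)*inv_d2;
+// x3 = (b3 - e30*x0 - e31*x1 - e32*x2)*inv_d3;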
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_4x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %xmm13
+ vmulps %xmm0, %xmm13, %xmm0
+ vbroadcastss 4(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm1
+ vbroadcastss 8(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm2
+ vbroadcastss 12(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm3
+ vbroadcastss 16(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm4
+ vbroadcastss 20(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm5
+ vbroadcastss 24(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm6
+ vbroadcastss 28(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm7
+
+ vbroadcastss 4(%r11), %xmm13
+ vmulps %xmm1, %xmm13, %xmm1
+ vbroadcastss 40(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm2
+ vbroadcastss 44(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm3
+ vbroadcastss 48(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm4
+ vbroadcastss 52(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm5
+ vbroadcastss 56(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm6
+ vbroadcastss 60(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm7
+
+ vbroadcastss 8(%r11), %xmm13
+ vmulps %xmm2, %xmm13, %xmm2
+ vbroadcastss 76(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm3
+ vbroadcastss 80(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm4
+ vbroadcastss 84(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm5
+ vbroadcastss 88(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm6
+ vbroadcastss 92(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm7
+
+ vbroadcastss 12(%r11), %xmm13
+ vmulps %xmm3, %xmm13, %xmm3
+ vbroadcastss 112(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm4
+ vbroadcastss 116(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm5
+ vbroadcastss 120(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm6
+ vbroadcastss 124(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm7
+
+ vbroadcastss 16(%r11), %xmm13
+ vmulps %xmm4, %xmm13, %xmm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm5
+ vbroadcastss 152(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm6
+ vbroadcastss 156(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm7
+
+ vbroadcastss 20(%r11), %xmm13
+ vmulps %xmm5, %xmm13, %xmm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %xmm13
+ vfnmadd231ps %xmm5, %xmm13, %xmm6
+ vbroadcastss 188(%r10), %xmm13
+ vfnmadd231ps %xmm5, %xmm13, %xmm7
+
+ vbroadcastss 24(%r11), %xmm13
+ vmulps %xmm6, %xmm13, %xmm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %xmm13
+ vfnmadd231ps %xmm6, %xmm13, %xmm7
+
+ vbroadcastss 28(%r11), %xmm13
+ vmulps %xmm7, %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_4x8_vs_lib8, .-inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
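+// Reference sketch (illustrative comment only, not part of the kernel):
+// scalar equivalent of this Cholesky edge on the 8x4 panel x[8][4] held in
+// ymm0..ymm3, whose top 4x4 block is the diagonal block being factorized;
+// a non-positive pivot is replaced by a zero scaling factor:
+//
+//   for (int j = 0; j < 4; j++) {
+//       float d = x[j][j];
+//       float inv = (d > 0.0f) ? 1.0f/sqrtf(d) : 0.0f;
+//       inv_diag_E[j] = inv;
+//       for (int i = 0; i < 8; i++) x[i][j] *= inv;
+//       for (int jj = j+1; jj < 4; jj++)
+//           for (int i = 0; i < 8; i++) x[i][jj] -= x[jj][j] * x[i][j];
+//   }
+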
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
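+// Reference sketch (illustrative comment only): the scaling applied to the
+// 8x4 accumulator x[8][4] (ymm0..ymm3), with C stored column-major in
+// 8-float panel columns; the beta==0.0 test only skips the loads from C:
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = 0; i < 8; i++)
+//           x[i][j] = alpha[0]*x[i][j]
+//                   + (beta[0] != 0.0f ? beta[0]*C[i+8*j] : 0.0f);
+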
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
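+// Reference sketch (illustrative comment only): the 8x4 accumulator
+// (ymm0..ymm3, one 8-float column each) is transposed by the unpck/extract
+// sequence below into eight 4-float rows (xmm0..xmm7), then scaled against a
+// 4x8 C stored in 8-float panel columns:
+//
+//   for (int i = 0; i < 8; i++)         // i-th 4-float output row (xmm_i)
+//       for (int j = 0; j < 4; j++)
+//           out[i][j] = alpha[0]*acc[i][j]
+//                     + (beta[0] != 0.0f ? beta[0]*C[j+8*i] : 0.0f);
+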
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm0
+ vmovaps 32(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm1
+ vmovaps 64(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm2
+ vmovaps 96(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm3
+ vmovaps 128(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm4
+ vmovaps 160(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm5
+ vmovaps 192(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm6
+ vmovaps 224(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta (generalized C offset handling)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta (generalized C offset handling)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+	vmovaps		0(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm0
+	vmovaps		32(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm1
+	vmovaps		64(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm2
+	vmovaps		96(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm3
+	vmovaps		128(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm4
+	vmovaps		160(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm5
+	vmovaps		192(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm6
+	vmovaps		224(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib8, @function
+inner_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib8, .-inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x8_lib8, @function
+inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib8:
+#endif
+#endif
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm0
+ vmovaps 32(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm1
+ vmovaps 64(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm2
+ vmovaps 96(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm3
+ vmovaps 128(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm4
+ vmovaps 160(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm5
+ vmovaps 192(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm6
+ vmovaps 224(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x8_lib8, .-inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0 (generalized C offset handling)
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_gen_lib8, @function
+inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+
+ // offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_gen_lib8, .-inner_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
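+// Worked example (illustrative comment only; lower 128-bit half shown, the
+// upper half holds rows 4..7 with the same pattern): the NT micro-kernel
+// leaves the columns rotated per lane, and the two rounds of vblendps below
+// regroup them into plain columns before scaling:
+//
+//   ymm8  = [d00 d10 d22 d32]   (blend 0xaa of ymm0, ymm1)
+//   ymm9  = [d01 d11 d23 d33]   (blend 0x55 of ymm0, ymm1)
+//   ymm10 = [d03 d13 d21 d31]   (blend 0xaa of ymm2, ymm3)
+//   ymm11 = [d02 d12 d20 d30]   (blend 0x55 of ymm2, ymm3)
+//   ymm0  = [d00 d10 d20 d30]   (blend 0xcc of ymm8, ymm11), and so on
+//
+// after which the alpha/beta scaling matches inner_scale_ab_8x4_lib8 above.
+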
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %r15 // C0
+ addq %r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+
+ vmovaps %xmm0, 0(%r10)
+ vmovaps %xmm1, 32(%r10)
+ vmovaps %xmm2, 64(%r10)
+ vmovaps %xmm3, 96(%r10)
+ vmovaps %xmm4, 128(%r10)
+ vmovaps %xmm5, 160(%r10)
+ vmovaps %xmm6, 192(%r10)
+ vmovaps %xmm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
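+// Reference sketch (illustrative comment only): the row mask is built in
+// float arithmetic, assuming .LC00 holds the ascending per-lane constants
+// (0.5, 1.5, ..., 7.5): lane i gets its sign bit set iff lc00[i] < (float)km,
+// i.e. iff i < km, and vmaskmovps stores only the lanes whose sign bit is set:
+//
+//   for (int i = 0; i < 8; i++)
+//       mask[i] = ((float)i + 0.5f < (float)km) ? 0x80000000 : 0x00000000;
+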
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm12, %ymm14
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %ymm1, %ymm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r10)
+ je 0f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm12, %xmm14
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %xmm1, %xmm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %xmm2, %xmm14, 64(%r10)
+ cmpl $4, %r12d
+ jl 0f // end
+ vmaskmovps %xmm3, %xmm14, 96(%r10)
+ cmpl $5, %r12d
+ jl 0f // end
+ vmaskmovps %xmm4, %xmm14, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %xmm5, %xmm14, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %xmm6, %xmm14, 192(%r10)
+ je 0f // end
+ vmaskmovps %xmm7, %xmm14, 224(%r10)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
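+// Reference sketch (illustrative comment only) of the windowing done below in
+// the offset==0 path: the accumulator columns are shifted left by n0, the
+// column count is clamped, and a row mask keeps rows m0 <= i < m1 (built from
+// .LC00 as in the vs store above):
+//
+//   ncol = (n1 < 4 ? n1 : 4) - n0;            // columns actually written
+//   for (int i = 0; i < 8; i++)
+//       rowmask[i] = (i >= m0) && (i < m1);
+//   // then store ncol masked columns of D at D+0, D+32, D+64, D+96
+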
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ vmovaps %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ addq $32, %r11
+
+ cmpl $3, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ addq $32, %r11
+
+ cmpl $4, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ addq $32, %r11
+
+ cmpl $5, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ addq $32, %r11
+
+ cmpl $6, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %xmm1, %xmm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %xmm2, %xmm15, 64(%r11)
+ cmpl $4, %r15d
+ jl 7f // end
+ vmaskmovps %xmm3, %xmm15, 96(%r11)
+ cmpl $5, %r15d
+ jl 7f // end
+ vmaskmovps %xmm4, %xmm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %xmm5, %xmm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %xmm6, %xmm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %xmm7, %xmm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vblendps $0x3, %ymm13, %ymm2, %ymm2
+ vblendps $0x7, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ //
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
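+// usage sketch (illustrative only, not taken from this file): alpha and beta are
+// passed by pointer, and A, B, C, D are assumed to point to packed lib8 panels
+// (8-float columns); K and the packing step are hypothetical.
+//
+//   float alpha = 1.0f, beta = 0.0f;
+//   float A[8*K], B[8*K], C[8*4], D[8*4]; // one 8-row panel each
+//   /* pack A and B in panel-major order, fill C */
+//   kernel_sgemm_nt_8x4_lib8(K, &alpha, A, B, &beta, C, D);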
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+// 0 1 2 3 4 5 6 7 8
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
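+// note on offsetB/sdb (assumption, based on the lib8 panel-major layout): the
+// matrix is stored in row panels of height 8, one 8-float column after another,
+// and consecutive panels are 8*sdb floats apart -- hence the shift by 5 above to
+// turn sdb into a byte stride. Scalar index sketch (hypothetical helper):
+//
+//   // offset of element (i, j) in a panel-major matrix with panel size 8
+//   static inline int ps8_index(int i, int j, int sd)
+//       {
+//       return (i / 8) * 8 * sd + j * 8 + i % 8;
+//       }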
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
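+// note: as suggested by the inner-routine sequence above (gemm_sub, scale_11,
+// edge_trsm_rlt_inv, store), this kernel appears to compute the 8x4 block
+// D = (C - A * B^T) * E^{-T}, with E lower triangular and inv_diag_E holding
+// precomputed reciprocals of its diagonal (hence the "inv" suffix).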
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_4x8_lib8
+ .type kernel_strsm_nt_rl_inv_4x8_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_4x8_lib8
+_kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_4x8_lib8
+ .def kernel_strsm_nt_rl_inv_4x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG2, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_4x8_lib8, .-kernel_strsm_nt_rl_inv_4x8_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_4x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_4x8_vs_lib8
+_kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG2, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_4x8_vs_lib8, .-kernel_strsm_nt_rl_inv_4x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
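+// note: as suggested by the inner-routine sequence above (gemm_sub, scale_11,
+// edge_potrf, store_l), this kernel appears to perform the Cholesky step on the
+// 8x4 block of C - A * B^T, writing the lower factor to D and the reciprocals
+// of the factor's diagonal entries to inv_diag_D.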
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+	movq	ARG7, %r11 // km
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
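+// note: the .long values above are IEEE-754 single-precision bit patterns
+// (LC00-LC02 are index+0.5 tables used by the masked stores, LC03/LC04 are
+// +/-1.0 patterns). A small stand-alone C check (illustrative only):
+//
+//   #include <stdio.h>
+//   #include <string.h>
+//   int main(void)
+//       {
+//       unsigned int bits[8] = {1056964608u, 1069547520u, 1075838976u, 1080033280u,
+//                               1083179008u, 1085276160u, 1087373312u, 1089470464u};
+//       for (int i = 0; i < 8; i++)
+//           {
+//           float f;
+//           memcpy(&f, &bits[i], sizeof(f));
+//           printf("%g ", f); // prints 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5
+//           }
+//       printf("\n");
+//       return 0;
+//       }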
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x8_lib8.S b/kernel/avx2/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..094acda
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5395 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
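+// C-level sketch of the broadcast scheme below (illustrative only; acc[]
+// stands for the ymm0-7 accumulators, stored column-major as an 8x8 block;
+// A and B are 8-wide single-precision panels):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<8; jj++)
+//			for(ii=0; ii<8; ii++)
+//				acc[ii+8*jj] += A[ii+8*kk] * B[jj+8*kk]; // B accessed transposed (nt)
+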
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+
+ cmpl $3, %r10d
+ jle 4f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vmovaps 32(%r11), %ymm13 // A
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 48(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 52(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 56(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 60(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vmovaps -64(%r11), %ymm13 // A
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 80(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 84(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 88(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 92(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r12
+
+ // unroll 0
+ vmovaps -32(%r11), %ymm13 // A
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss -16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss -12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss -8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss -4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 32(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 96(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 0(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 32(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 96(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 112(%r12), %ymm15 // B
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+// vbroadcastf128 0(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+// vbroadcastf128 16(%r12), %ymm15 // B
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vfmadd231ps %ymm12, %ymm14, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+
+ cmpl $3, %r10d
+ jle 4f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vmovaps 32(%r11), %ymm13 // A
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 48(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 52(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 56(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 60(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vmovaps -64(%r11), %ymm13 // A
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 80(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 84(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 88(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 92(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r12
+
+ // unroll 0
+ vmovaps -32(%r11), %ymm13 // A
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss -16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss -12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss -8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss -4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
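+// C-level sketch of the nn variant (illustrative only; acc[] stands for the
+// ymm0-7 accumulators and B is stored in 8-row panels with panel stride sdb):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<8; jj++)
+//			for(ii=0; ii<8; ii++)
+//				acc[ii+8*jj] += A[ii+8*kk] * B[(kk/8)*8*sdb + kk%8 + 8*jj];
+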
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// prefetcht0 0(%r12, %r13, 1) // software prefetch
+// prefetcht0 64(%r12, %r13, 1) // software prefetch
+// prefetcht0 128(%r12, %r13, 1) // software prefetch
+// prefetcht0 192(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 132(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 164(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 196(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 228(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 136(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 168(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 200(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 232(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 140(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 172(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 204(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 236(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 144(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 176(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 208(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 240(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 148(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 180(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 212(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 244(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 152(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 184(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 216(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 248(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 156(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 188(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 220(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 252(%r12), %ymm13 // B[7]
+	addq	%r13, %r12 // B to next row panel
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
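+// C-level sketch of the edge handling (illustrative only): only the first,
+// possibly partial, panel of B is consumed here, so that the main kernel can
+// then run panel-aligned:
+//
+//	kend = k<8-offB ? k : 8-offB;
+//	for(kk=0; kk<kend; kk++)
+//		for(jj=0; jj<8; jj++)
+//			for(ii=0; ii<8; ii++)
+//				acc[ii+8*jj] += A[ii+8*kk] * B[offB+kk + 8*jj];
+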
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r14d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1: // kend loop
+	// unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
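+// C-level sketch of the solve (illustrative only; acc[] is the 8x8 result in
+// ymm0-7, D is the lower-triangular 8x8 panel at r10, inv_diag_D holds the
+// pre-inverted diagonal; columns jj>=kn are skipped in the vs variant):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		for(ii=0; ii<8; ii++)
+//			acc[ii+8*jj] *= inv_diag_D[jj];
+//		for(ll=jj+1; ll<8; ll++)
+//			for(ii=0; ii<8; ii++)
+//				acc[ii+8*ll] -= acc[ii+8*jj] * D[ll+8*jj];
+//		}
+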
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vbroadcastss 16(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm4
+ vbroadcastss 20(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm5
+ vbroadcastss 24(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm6
+ vbroadcastss 28(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vbroadcastss 48(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm4
+ vbroadcastss 52(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm5
+ vbroadcastss 56(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm6
+ vbroadcastss 60(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vbroadcastss 80(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm4
+ vbroadcastss 84(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm5
+ vbroadcastss 88(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm6
+ vbroadcastss 92(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vbroadcastss 112(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm4
+ vbroadcastss 116(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm5
+ vbroadcastss 120(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm6
+ vbroadcastss 124(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm7
+
+ vbroadcastss 16(%r11), %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vbroadcastss 152(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vbroadcastss 156(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+ vbroadcastss 20(%r11), %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vbroadcastss 188(%r10), %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+ vbroadcastss 24(%r11), %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %ymm13
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+ vbroadcastss 28(%r11), %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
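+// C-level sketch of the factorization step (illustrative only; acc[] is the
+// 8x8 accumulator in ymm0-7, a non-positive pivot yields a zero inverse as in
+// the 1f..15f branches below; columns jj>=kn are skipped in the vs variant):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		tmp = acc[jj+8*jj];
+//		tmp = tmp>0.0f ? 1.0f/sqrtf(tmp) : 0.0f;
+//		inv_diag_E[jj] = tmp;
+//		for(ii=0; ii<8; ii++)
+//			acc[ii+8*jj] *= tmp;
+//		for(ll=jj+1; ll<8; ll++)
+//			for(ii=0; ii<8; ii++)
+//				acc[ii+8*ll] -= acc[ii+8*jj] * acc[ll+8*jj];
+//		}
+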
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vbroadcastss %xmm13, %ymm13
+// vpermilps $0x00, %xmm13, %xmm13
+// vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm4, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+ jbe 9f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+10:
+ vmovss %xmm13, 16(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm5, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+ jbe 11f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+12:
+ vmovss %xmm13, 20(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm6, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+ jbe 13f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+14:
+ vmovss %xmm13, 24(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm7, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+ jbe 15f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+16:
+ vmovss %xmm13, 28(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
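+// C-level sketch (illustrative only; acc[] is the ymm0-7 accumulator):
+// D = alpha*acc + beta*C, and when beta==0.0 the C panel is never read:
+//
+//	for(jj=0; jj<8; jj++)
+//		for(ii=0; ii<8; ii++)
+//			{
+//			acc[ii+8*jj] *= alpha;
+//			if(beta!=0.0f)
+//				acc[ii+8*jj] += beta * C[ii+8*jj];
+//			}
+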
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vmovaps 128(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 160(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 192(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 224(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_lib8, @function
+inner_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_lib8, .-inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_gen_lib8, @function
+inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_gen_lib8, .-inner_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
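+// Note on the blend step (illustrative description): the vblendps pairs with
+// masks 0xaa/0x55 and then 0xcc/0x33 recombine accumulators produced in the
+// permuted layout documented above ([d00 d11 d22 d33 ...]) back into plain
+// column order, after which the alpha/beta scaling matches inner_scale_ab_8x8_lib8.
+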
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vmovaps 128(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 160(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 192(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 224(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm4, 128(%r10)
+ vmovaps %ymm5, 160(%r10)
+ vmovaps %ymm6, 192(%r10)
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
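+// For reference, the effect of the vs ("variable size") store above, written
+// as a plain-C sketch only for illustration; it assumes 0 < km <= 8,
+// 4 < kn <= 8 and the lib8 panel layout, where element (i,j) of the 8x8 block
+// lives at D[i+8*j] and acc[j] plays the role of register ymm<j>:
+//
+//	static void store_8x8_vs_ref(float *D, float acc[8][8], int km, int kn)
+//		{
+//		int i, j;
+//		for(j=0; j<kn; j++)
+//			for(i=0; i<km; i++)
+//				D[i+8*j] = acc[j][i];
+//		}
+//
+// The assembly avoids the scalar loops: it builds a per-row mask as the sign
+// of (i+0.5)-km using the .LC00 ramp, lets vmaskmovps drop rows i >= km, and
+// uses the kn comparisons to skip whole trailing columns.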
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
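+	// ymm14 gets the sign bit set for rows i >= m0 (sign of m0-(i+0.5)) and
+	// ymm15 for rows i < m1 (sign of (i+0.5)-m1); their AND keeps only rows
+	// in [m0,m1), which vmaskmovps then uses as the store mask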
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+	// shift the accumulation registers and D to skip the first n0 columns
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+
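+	// lower ("_l") store: for column j, the j elements above the diagonal
+	// are taken from the existing D (loaded into ymm14 and blended back in),
+	// so the strictly upper triangle in memory is left untouched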
+ vmovaps %ymm0, 0(%r10)
+ vmovaps 32(%r10), %ymm14
+ vblendps $0x01, %ymm14, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps 64(%r10), %ymm14
+ vblendps $0x03, %ymm14, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps 96(%r10), %ymm14
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps 128(%r10), %ymm14
+ vblendps $0x0f, %ymm14, %ymm4, %ymm4
+ vmovaps %ymm4, 128(%r10)
+ vmovaps 160(%r10), %ymm14
+ vblendps $0x1f, %ymm14, %ymm5, %ymm5
+ vmovaps %ymm5, 160(%r10)
+ vmovaps 192(%r10), %ymm14
+ vblendps $0x3f, %ymm14, %ymm6, %ymm6
+ vmovaps %ymm6, 192(%r10)
+ vmovaps 224(%r10), %ymm14
+ vblendps $0x7f, %ymm14, %ymm7, %ymm7
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmovaps 128(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmovaps 160(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmovaps 192(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmovaps 224(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+	// shift the accumulation registers and D to skip the first n0 columns
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmovaps 128(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmovaps 160(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmovaps 192(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmovaps 224(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
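+// Illustrative reference for the kernel above (not part of the build, the
+// _ref name is made up here): what kernel_sgemm_nt_8x8_lib8 computes, in
+// plain C, assuming A, B, C, D are packed in 8-row panels so that element
+// (i,l) of A sits at A[i+8*l]:
+//
+//	void kernel_sgemm_nt_8x8_lib8_ref(int k, float *alpha, float *A,
+//		float *B, float *beta, float *C, float *D)
+//		{
+//		int i, j, l;
+//		for(j=0; j<8; j++)
+//			for(i=0; i<8; i++)
+//				{
+//				float d = 0.0f;
+//				for(l=0; l<k; l++)
+//					d += A[i+8*l] * B[j+8*l]; // nt: B accessed transposed
+//				D[i+8*j] = alpha[0]*d + beta[0]*C[i+8*j];
+//				}
+//		}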
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
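+// The _gen variant above differs from the plain kernel only in the epilogue:
+// C and D are addressed through an (offset, pointer, stride) triple and the
+// store is restricted to rows [m0,m1) and columns [n0,n1) of the 8x8 block;
+// non-zero offsets are still left as TODO in the inner gen routines.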
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
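+// Illustrative reference (not part of the build) for the trsm kernel above,
+// assuming the lib8 panel layout and that inv_diag_E[j] holds 1.0/E(j,j):
+// it solves D * E^T = C - A*B^T for one 8x8 block, E lower triangular.
+//
+//	void kernel_strsm_nt_rl_inv_8x8_lib8_ref(int k, float *A, float *B,
+//		float *C, float *D, float *E, float *inv_diag_E)
+//		{
+//		int i, j, l;
+//		for(j=0; j<8; j++)
+//			for(i=0; i<8; i++)
+//				{
+//				float d = C[i+8*j];
+//				for(l=0; l<k; l++)
+//					d -= A[i+8*l] * B[j+8*l];
+//				for(l=0; l<j; l++)
+//					d -= D[i+8*l] * E[j+8*l]; // E(j,l), l<j: lower triangle
+//				D[i+8*j] = d * inv_diag_E[j];
+//				}
+//		}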
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // m1
+ movq ARG9, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
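+// Illustrative reference (not part of the build) for the spotrf kernel above,
+// assuming the lib8 panel layout and that inv_diag_D[j] receives 1.0/D(j,j):
+// it computes the lower Cholesky factor D of C - A*B^T (so D*D^T = C - A*B^T);
+// the sketch does not guard against non-positive pivots.
+//
+//	#include <math.h>
+//
+//	void kernel_spotrf_nt_l_8x8_lib8_ref(int k, float *A, float *B,
+//		float *C, float *D, float *inv_diag_D)
+//		{
+//		int i, j, l;
+//		float tmp[8];
+//		for(j=0; j<8; j++)
+//			{
+//			for(i=j; i<8; i++)
+//				{
+//				float d = C[i+8*j];
+//				for(l=0; l<k; l++)
+//					d -= A[i+8*l] * B[j+8*l];
+//				for(l=0; l<j; l++)
+//					d -= D[i+8*l] * D[j+8*l];
+//				tmp[i] = d;
+//				}
+//			inv_diag_D[j] = 1.0f/sqrtf(tmp[j]);
+//			for(i=j; i<8; i++)
+//				D[i+8*j] = tmp[i] * inv_diag_D[j];
+//			}
+//		}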
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
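+// The .long values below are the IEEE-754 single-precision bit patterns of
+// the constants listed in each label's comment (e.g. 1056964608 = 0x3f000000
+// = 0.5f); .LC00/.LC01/.LC02 are the index ramps used to build the
+// vmaskmovps row masks, .LC03 is all 1.0 and .LC09 is 1.0 except for -1.0 in
+// the two highest lanes.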
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/c99/Makefile b/kernel/c99/Makefile
new file mode 100644
index 0000000..55d54ef
--- /dev/null
+++ b/kernel/c99/Makefile
@@ -0,0 +1,80 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemv_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemv_4_lib4.o
+#OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+OBJS +=
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
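+// C reference kernel: computes a 4x4 block of D = alpha*A*B^T + beta*C for matrices stored in
+// 4-wide panel-major format. offsetC/offsetD give the starting row inside the C/D panels (C1/D1
+// point to the following panel), and m0/m1, n0/n1 mask which rows and columns are written back.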
+void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+	// shift solution left to drop the first n0 columns
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
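+// Variable-size variant of the nt kernel: same D = alpha*A*B^T + beta*C block, panel-aligned
+// (no offsets); km and kn give the number of rows and columns actually stored into D.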
+void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
+void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
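+// 'nn' variant: D = alpha*A*B + beta*C, with B read row-wise across panels of stride sdb.
+// offsetB is the row offset of B inside its first panel and is consumed by the peeled
+// iterations before the main unrolled loop; C/D offsets and masks work as in the nt gen kernel.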
+void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ k = 0;
+ if(offsetB!=0)
+ {
+ if(offsetB==1)
+ {
+
+ B += 1;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ B += 2;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ else // if(offsetB==3)
+ {
+
+ B += 3;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ }
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ scale:
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+	// shift solution left to drop the first n0 columns
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
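+// syrk, lower: accumulates only the lower triangle of the 4x4 block of D = alpha*A*B^T + beta*C;
+// offsets and the m0/m1, n0/n1 masks behave as in the gemm gen kernels above.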
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0,
+ c_10=0, c_11=0,
+ c_20=0, c_21=0, c_22=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+	// shift solution left to drop the first n0 columns
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
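+// Variable-size syrk lower kernel: km/kn limit the rows and columns written back, as in the
+// gemm vs kernel.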
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0,
+ c_10=0, c_11=0,
+ c_20=0, c_21=0, c_22=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[2+bs*2] = c_22;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
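+// trmm, right upper, 'nt' form: the first three k iterations cross the triangular part of B and
+// use only the entries defined there (1, 2 and 3 columns respectively); after that the block is
+// dense and the regular 4x4 unrolled loop runs. The result is stored as beta*C + alpha*(A*B^T),
+// with km/kn masking as in the other vs kernels.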
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ k = 0;
+
+ // k = 0
+ if(kmax>0)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 1
+	if(kmax>1)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 2
+	if(kmax>2)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// remainder iteration
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
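+// trmm, right lower, 'nn' form: B (lower triangular, starting at row offsetB of its panel) is
+// read row-wise with panel stride sdb; the peeled iterations walk the triangular columns until B
+// is panel-aligned, and the result alpha*(A*B) is written at the store label with the usual
+// offsetD and m0/m1, n0/n1 masking (this kernel has no beta/C term).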
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double *D1;
+
+ int k;
+
+ B += offsetB;
+
+ k = 0;
+
+ if(offsetB==0)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==1)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 5
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+	else // if(offsetB==3)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ store:
+
+ c_00 = alpha[0]*c_00;
+ c_10 = alpha[0]*c_10;
+ c_20 = alpha[0]*c_20;
+ c_30 = alpha[0]*c_30;
+
+ c_01 = alpha[0]*c_01;
+ c_11 = alpha[0]*c_11;
+ c_21 = alpha[0]*c_21;
+ c_31 = alpha[0]*c_31;
+
+ c_02 = alpha[0]*c_02;
+ c_12 = alpha[0]*c_12;
+ c_22 = alpha[0]*c_22;
+ c_32 = alpha[0]*c_32;
+
+ c_03 = alpha[0]*c_03;
+ c_13 = alpha[0]*c_13;
+ c_23 = alpha[0]*c_23;
+ c_33 = alpha[0]*c_33;
+
+	// shift the solution left to drop the first n0 columns and advance the store pointer
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
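+// Cholesky factorization of the 4x4 block C - A*B^T: the lower factor is
+// written to D (at most km rows and kn columns), nonpositive pivots are
+// zeroed, and the reciprocals of the diagonal are returned in inv_diag_D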
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
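+	// unblocked Cholesky of the accumulated block: take the square root of
+	// each pivot (zeroing it if nonpositive) and scale the remainder of the
+	// column by its reciprocal, which is also saved in inv_diag_D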
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
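+// fused kernel: accumulate the symmetric update C + Ap*Bp^T into D, then
+// factorize D - Am*Bm^T with the Cholesky kernel above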
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+ }
+#endif
+
+
+
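+// solves D * E^T = C - A*B^T with E lower triangular; divisions by the
+// diagonal of E are replaced by multiplications with the precomputed
+// reciprocals in inv_diag_E; at most km rows and kn columns are stored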
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
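+	// forward substitution on the columns: column j subtracts the already
+	// solved columns i<j scaled by E[j+bs*i], then is scaled by inv_diag_E[j]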
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
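+// fused kernel: accumulate C + Ap*Bp^T into D, then apply the triangular
+// solve above with the Am*Bm^T correction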
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+ }
+#endif
+
+
+
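+// solves D * E^T = C - A*B^T with E unit lower triangular (its diagonal is
+// assumed to be one and is not accessed); at most km rows and kn columns
+// are stored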
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+ }
+#endif
+
+
+
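+// solves D * E^T = C - A*B^T with E upper triangular, by backward
+// substitution on the columns, using the reciprocals of the diagonal of E
+// stored in inv_diag_E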
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+
+ if(kn>3)
+ {
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+ tmp = E[2+bs*3];
+ c_02 -= c_03 * tmp;
+ c_12 -= c_13 * tmp;
+ c_22 -= c_23 * tmp;
+ c_32 -= c_33 * tmp;
+ tmp = E[1+bs*3];
+ c_01 -= c_03 * tmp;
+ c_11 -= c_13 * tmp;
+ c_21 -= c_23 * tmp;
+ c_31 -= c_33 * tmp;
+ tmp = E[0+bs*3];
+ c_00 -= c_03 * tmp;
+ c_10 -= c_13 * tmp;
+ c_20 -= c_23 * tmp;
+ c_30 -= c_33 * tmp;
+ }
+
+ if(kn>2)
+ {
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+ tmp = E[1+bs*2];
+ c_01 -= c_02 * tmp;
+ c_11 -= c_12 * tmp;
+ c_21 -= c_22 * tmp;
+ c_31 -= c_32 * tmp;
+ tmp = E[0+bs*2];
+ c_00 -= c_02 * tmp;
+ c_10 -= c_12 * tmp;
+ c_20 -= c_22 * tmp;
+ c_30 -= c_32 * tmp;
+ }
+
+ if(kn>1)
+ {
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+ tmp = E[0+bs*1];
+ c_00 -= c_01 * tmp;
+ c_10 -= c_11 * tmp;
+ c_20 -= c_21 * tmp;
+ c_30 -= c_31 * tmp;
+ }
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
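+// LU factorization without pivoting of the 4x4 block C - A*B: the unit lower
+// factor is stored below the diagonal of D, the upper factor on and above it,
+// and the reciprocals of the pivots are returned in inv_diag_D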
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // factorization
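+	// left-looking elimination: each column first subtracts the contributions
+	// of the previously factorized columns, then its subdiagonal is scaled by
+	// the pivot reciprocal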
+
+ // first column
+ tmp = 1.0 / c_00;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ // second column
+ c_11 -= c_10 * c_01;
+ c_21 -= c_20 * c_01;
+ c_31 -= c_30 * c_01;
+
+ tmp = 1.0 / c_11;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ // third column
+ c_12 -= c_10 * c_02;
+ c_22 -= c_20 * c_02;
+ c_32 -= c_30 * c_02;
+
+ c_22 -= c_21 * c_12;
+ c_32 -= c_31 * c_12;
+
+ tmp = 1.0 / c_22;
+ c_32 *= tmp;
+
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ // fourth column
+ c_13 -= c_10 * c_03;
+ c_23 -= c_20 * c_03;
+ c_33 -= c_30 * c_03;
+
+ c_23 -= c_21 * c_13;
+ c_33 -= c_31 * c_13;
+
+ c_33 -= c_32 * c_23;
+
+ tmp = 1.0 / c_33;
+
+ inv_diag_D[3] = tmp;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
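+// solves E * D = C - A*B with E unit lower triangular, by forward
+// substitution on the rows; at most km rows and kn columns are stored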
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_1, e_2, e_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solution
+
+ if(km==1)
+ goto store;
+
+ e_1 = E[1+bs*0];
+ e_2 = E[2+bs*0];
+ e_3 = E[3+bs*0];
+ c_10 -= e_1 * c_00;
+ c_20 -= e_2 * c_00;
+ c_30 -= e_3 * c_00;
+ c_11 -= e_1 * c_01;
+ c_21 -= e_2 * c_01;
+ c_31 -= e_3 * c_01;
+ c_12 -= e_1 * c_02;
+ c_22 -= e_2 * c_02;
+ c_32 -= e_3 * c_02;
+ c_13 -= e_1 * c_03;
+ c_23 -= e_2 * c_03;
+ c_33 -= e_3 * c_03;
+
+ if(km==2)
+ goto store;
+
+ e_2 = E[2+bs*1];
+ e_3 = E[3+bs*1];
+ c_20 -= e_2 * c_10;
+ c_30 -= e_3 * c_10;
+ c_21 -= e_2 * c_11;
+ c_31 -= e_3 * c_11;
+ c_22 -= e_2 * c_12;
+ c_32 -= e_3 * c_12;
+ c_23 -= e_2 * c_13;
+ c_33 -= e_3 * c_13;
+
+ if(km==3)
+ goto store;
+
+ e_3 = E[3+bs*2];
+ c_30 -= e_3 * c_20;
+ c_31 -= e_3 * c_21;
+ c_32 -= e_3 * c_22;
+ c_33 -= e_3 * c_23;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+ }
+#endif
+
+
+
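+// solves D * E = C - A*B with E upper triangular, proceeding column by
+// column and using the reciprocals of the diagonal of E from inv_diag_E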
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_10 *= e_00;
+ c_20 *= e_00;
+ c_30 *= e_00;
+
+ if(kn==1)
+ goto store;
+
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_01 -= c_00 * e_01;
+ c_11 -= c_10 * e_01;
+ c_21 -= c_20 * e_01;
+ c_31 -= c_30 * e_01;
+ c_01 *= e_11;
+ c_11 *= e_11;
+ c_21 *= e_11;
+ c_31 *= e_11;
+
+ if(kn==2)
+ goto store;
+
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_02 -= c_00 * e_02;
+ c_12 -= c_10 * e_02;
+ c_22 -= c_20 * e_02;
+ c_32 -= c_30 * e_02;
+ c_02 -= c_01 * e_12;
+ c_12 -= c_11 * e_12;
+ c_22 -= c_21 * e_12;
+ c_32 -= c_31 * e_12;
+ c_02 *= e_22;
+ c_12 *= e_22;
+ c_22 *= e_22;
+ c_32 *= e_22;
+
+ if(kn==3)
+ goto store;
+
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_03 -= c_00 * e_03;
+ c_13 -= c_10 * e_03;
+ c_23 -= c_20 * e_03;
+ c_33 -= c_30 * e_03;
+ c_03 -= c_01 * e_13;
+ c_13 -= c_11 * e_13;
+ c_23 -= c_21 * e_13;
+ c_33 -= c_31 * e_13;
+ c_03 -= c_02 * e_23;
+ c_13 -= c_12 * e_23;
+ c_23 -= c_22 * e_23;
+ c_33 -= c_32 * e_23;
+ c_03 *= e_33;
+ c_13 *= e_33;
+ c_23 *= e_33;
+ c_33 *= e_33;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
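+// solves E * D = C - A*B with E upper triangular, by backward substitution
+// on the rows, using the reciprocals of the diagonal of E from inv_diag_E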
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+// printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+// printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+// printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+// printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+
+ // solve
+
+ if(km>3)
+ {
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_30 *= e_33;
+ c_31 *= e_33;
+ c_32 *= e_33;
+ c_33 *= e_33;
+ c_00 -= e_03 * c_30;
+ c_01 -= e_03 * c_31;
+ c_02 -= e_03 * c_32;
+ c_03 -= e_03 * c_33;
+ c_10 -= e_13 * c_30;
+ c_11 -= e_13 * c_31;
+ c_12 -= e_13 * c_32;
+ c_13 -= e_13 * c_33;
+ c_20 -= e_23 * c_30;
+ c_21 -= e_23 * c_31;
+ c_22 -= e_23 * c_32;
+ c_23 -= e_23 * c_33;
+ }
+
+ if(km>2)
+ {
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_20 *= e_22;
+ c_21 *= e_22;
+ c_22 *= e_22;
+ c_23 *= e_22;
+ c_00 -= e_02 * c_20;
+ c_01 -= e_02 * c_21;
+ c_02 -= e_02 * c_22;
+ c_03 -= e_02 * c_23;
+ c_10 -= e_12 * c_20;
+ c_11 -= e_12 * c_21;
+ c_12 -= e_12 * c_22;
+ c_13 -= e_12 * c_23;
+ }
+
+ if(km>1)
+ {
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_10 *= e_11;
+ c_11 *= e_11;
+ c_12 *= e_11;
+ c_13 *= e_11;
+ c_00 -= e_01 * c_10;
+ c_01 -= e_01 * c_11;
+ c_02 -= e_01 * c_12;
+ c_03 -= e_01 * c_13;
+ }
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_01 *= e_00;
+ c_02 *= e_00;
+ c_03 *= e_00;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
diff --git a/kernel/c99/kernel_dgemm_diag_lib4.c b/kernel/c99/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..cad2b21
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,1111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
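+// computes D = alpha * A * diag(B) on a kmax x 4 block: each of the 4 columns of the
+// panel-major matrix A (panel strides sda, sdd) is scaled by the corresponding entry
+// of B; the _a0 variant assumes beta==0, so no C term is read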
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_0;
+ c_2 = a_2 * b_0;
+ c_3 = a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = a_0 * b_1;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_1;
+ c_3 = a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = a_0 * b_2;
+ c_1 = a_1 * b_2;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = a_0 * b_3;
+ c_1 = a_1 * b_3;
+ c_2 = a_2 * b_3;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
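+// computes D = beta * C + alpha * A * diag(B) on a kmax x 4 panel-major block
+// (same access pattern as the _a0 variant above, with the C term added)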
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
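+// computes D = alpha * diag(A) * B on a 4 x kmax block: each of the 4 rows of B is
+// scaled by the corresponding entry of A; the _a0 variant assumes beta==0, so no C
+// term is read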
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
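+// computes D = beta * C + alpha * diag(A) * B on a 4 x kmax block
+// (same access pattern as the _a0 variant above, with the C term added)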
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
diff --git a/kernel/c99/kernel_dgemv_4_lib4.c b/kernel/c99/kernel_dgemv_4_lib4.c
new file mode 100644
index 0000000..9f11b5f
--- /dev/null
+++ b/kernel/c99/kernel_dgemv_4_lib4.c
@@ -0,0 +1,1009 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
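+// matrix-vector product, non-transposed: z[0:4] = beta * y[0:4] + alpha * A * x,
+// where A is a 4 x kmax block stored column-wise within one panel; only the entries i
+// with k0 <= i < k1 are written to z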
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_vs_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+#endif
+
+
+
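+// matrix-vector product, transposed: z[0:4] = beta * y[0:4] + alpha * A^T * x,
+// where A is a kmax x 4 block stored in 4-row panels with stride sda; offA is the row
+// offset within the first panel and km limits how many of the 4 results are stored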
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km)
+ {
+
+ const int bs = 4;
+
+ int k, kend;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ if(offA!=0) // 1, 2, 3
+ {
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1);
+ }
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_vs_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
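+// triangular solve, lower non-transposed: accumulates y[0:4] - A[:,0:kmax] * x, then
+// forward-substitutes with the 4 x 4 lower-triangular block that follows those kmax
+// columns, using the reciprocal diagonal entries in inv_diag_A; km and kn restrict
+// the rows and columns that are processed and stored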
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_ln_inv_4_vs_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ double
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_ln_inv_4_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ kernel_dtrsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+ }
+#endif
+
+
+
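+// triangular solve, lower transposed: accumulates y[0:4] - A[4:kmax,0:4]^T * x[4:kmax],
+// then back-substitutes with the transpose of the 4 x 4 lower-triangular block at the
+// top of A, using the reciprocal diagonal entries in inv_diag_A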
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_4_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_3_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_2_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_1_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
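+// triangular matrix-vector product, upper non-transposed: z[0:4] = A * x, where the
+// first 4 columns of the 4 x kmax block A form an upper-triangular block (only its
+// upper part is accessed) and the remaining kmax-4 columns are treated as dense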
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_un_4_lib4(int kmax, double *A, double *x, double *z)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
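+// triangular matrix-vector product, upper transposed: z[0:4] = A^T * x, where A is a
+// kmax x 4 block stored in 4-row panels with stride sda and its trailing 4 x 4 block
+// is upper-triangular (only its upper part is accessed); km limits how many of the 4
+// results are stored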
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_ut_4_vs_lib4(int kmax, double *A, int sda, double *x, double *z, int km)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+ // store_vs
+ store:
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_ut_4_lib4(int kmax, double *A, int sda, double *x, double *z)
+ {
+
+ kernel_dtrmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
diff --git a/kernel/c99/kernel_dgeqrf_4_lib4.c b/kernel/c99/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..071ec86
--- /dev/null
+++ b/kernel/c99/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2620 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
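+// Householder QR factorization of the m x 4 panel-major matrix pD (panel stride sdd):
+// for each of the 4 columns it forms a Householder reflector, stores the scalar factor
+// tau in dD, overwrites the diagonal entry with beta (the R entry) and the part below
+// the diagonal with the reflector vector, and applies the reflector to the trailing
+// columns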
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ ii = 1;
+ if(m>1)
+ {
+ tmp = pD[1+ps*0];
+ beta += tmp*tmp;
+ if(m>2)
+ {
+ tmp = pD[2+ps*0];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*0];
+ beta += tmp*tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ ii = 1;
+ if(m>1)
+ {
+ pD[1+ps*0] *= tmp;
+ if(m>2)
+ {
+ pD[2+ps*0] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*0] *= tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*0] *= tmp;
+ pD[1+ii*sdd+ps*0] *= tmp;
+ pD[2+ii*sdd+ps*0] *= tmp;
+ pD[3+ii*sdd+ps*0] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[0+ps*1];
+ w2 = pD[0+ps*2];
+ w3 = pD[0+ps*3];
+ if(m>1)
+ {
+ w1 += pD[1+ps*1] * pD[1+ps*0];
+ w2 += pD[1+ps*2] * pD[1+ps*0];
+ w3 += pD[1+ps*3] * pD[1+ps*0];
+ if(m>2)
+ {
+ w1 += pD[2+ps*1] * pD[2+ps*0];
+ w2 += pD[2+ps*2] * pD[2+ps*0];
+ w3 += pD[2+ps*3] * pD[2+ps*0];
+ if(m>3)
+ {
+ w1 += pD[3+ps*1] * pD[3+ps*0];
+ w2 += pD[3+ps*2] * pD[3+ps*0];
+ w3 += pD[3+ps*3] * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[0+ps*1] += w1;
+ pD[0+ps*2] += w2;
+ pD[0+ps*3] += w3;
+ if(m>1)
+ {
+ pD[1+ps*1] += w1 * pD[1+ps*0];
+ pD[1+ps*2] += w2 * pD[1+ps*0];
+ pD[1+ps*3] += w3 * pD[1+ps*0];
+ if(m>2)
+ {
+ pD[2+ps*1] += w1 * pD[2+ps*0];
+ pD[2+ps*2] += w2 * pD[2+ps*0];
+ pD[2+ps*3] += w3 * pD[2+ps*0];
+ if(m>3)
+ {
+ pD[3+ps*1] += w1 * pD[3+ps*0];
+ pD[3+ps*2] += w2 * pD[3+ps*0];
+ pD[3+ps*3] += w3 * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+ }
+ if(m==1)
+ return;
+ // second column
+ beta = 0.0;
+ if(m>2)
+ {
+ tmp = pD[2+ps*1];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*1];
+ beta += tmp*tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[1+ps*1] = beta;
+ if(m>2)
+ {
+ pD[2+ps*1] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*1] *= tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] *= tmp;
+ pD[1+ii*sdd+ps*1] *= tmp;
+ pD[2+ii*sdd+ps*1] *= tmp;
+ pD[3+ii*sdd+ps*1] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[1+ps*2];
+ w3 = pD[1+ps*3];
+ if(m>2)
+ {
+ w2 += pD[2+ps*2] * pD[2+ps*1];
+ w3 += pD[2+ps*3] * pD[2+ps*1];
+ if(m>3)
+ {
+ w2 += pD[3+ps*2] * pD[3+ps*1];
+ w3 += pD[3+ps*3] * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[1+ps*2] += w2;
+ pD[1+ps*3] += w3;
+ if(m>2)
+ {
+ pD[2+ps*2] += w2 * pD[2+ps*1];
+ pD[2+ps*3] += w3 * pD[2+ps*1];
+ if(m>3)
+ {
+ pD[3+ps*2] += w2 * pD[3+ps*1];
+ pD[3+ps*3] += w3 * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+ }
+ if(m==2)
+ return;
+ // third column
+ beta = 0.0;
+ if(m>3)
+ {
+ tmp = pD[3+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[2+ps*2] = beta;
+ if(m>3)
+ {
+ pD[3+ps*2] *= tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] *= tmp;
+ pD[1+ii*sdd+ps*2] *= tmp;
+ pD[2+ii*sdd+ps*2] *= tmp;
+ pD[3+ii*sdd+ps*2] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[2+ps*3];
+ if(m>3)
+ {
+ w3 += pD[3+ps*3] * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ w3 = - dD[2] * w3;
+ pD[2+ps*3] += w3;
+ if(m>3)
+ {
+ pD[3+ps*3] += w3 * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+ }
+ if(m==3)
+ return;
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau3
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] *= tmp;
+ pD[1+ii*sdd+ps*3] *= tmp;
+ pD[2+ii*sdd+ps*3] *= tmp;
+ pD[3+ii*sdd+ps*3] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+// unblocked algorithm
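+// factors k columns of the m x n matrix pD (panel-major with stride sdd, starting at
+// row offset offD), storing the scalar factors tau in dD and applying each reflector
+// to the remaining columns before moving to the next one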
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k; //m<n ? m : n;
+ double alpha, beta, tmp, w0;
+ double *pC00, *pC10, *pC01, *pC11;
+ int offset;
+ double *pD0 = pD-offD;
+ for(ii=0; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ tmp = pC10[1+offset];
+ beta += tmp*tmp;
+ tmp = pC10[2+offset];
+ beta += tmp*tmp;
+ tmp = pC10[3+offset];
+ beta += tmp*tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ pC10[0+offset] *= tmp;
+ pC10[1+offset] *= tmp;
+ pC10[2+offset] *= tmp;
+ pC10[3+offset] *= tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ pC00[0] = beta;
+ }
+ if(ii<n)
+ {
+ pC01 = pC00 + ps;
+ pC11 = pC10 + ps;
+ kmax = jmax;
+ kmax0 = jmax0;
+ jmax = n-ii-1;
+ jj = 0;
+ for( ; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ps*jj] * 1.0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+ w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+ w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ps*jj] += w0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ offset = offset-ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+ pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+ pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
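+// applies the block of 4 Householder reflectors stored in pD (vectors) and dD (tau) to
+// the m x n matrix pC0 with panel stride sdc: it builds the 4 x 4 lower-triangular
+// factor T from the reflector dot products and tau, then updates C = (I - V*T*V^T) * C,
+// processing two columns of C per outer iteration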
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 2;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[1+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pD[1+ps*0];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] = tmp;
+ tmp = pC[1+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pD[2+ps*0];
+ d1 = pD[2+ps*1];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] = tmp;
+ tmp = pC[2+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] = tmp;
+ if(m>3)
+ {
+ d0 = pD[3+ps*0];
+ d1 = pD[3+ps*1];
+ d2 = pD[3+ps*2];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] = tmp;
+ tmp = pC[3+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ pC[0+ps*1] -= pW[1+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+ps*0];
+ pW[0+ldw*1] = tmp;
+ if(m>2)
+ {
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+ps*1];
+ pW[0+ldw*2] = tmp;
+ if(m>3)
+ {
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+ps*2];
+ pW[0+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ }
+
+ return;
+ }
+
+
+
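+// Apply a block of 4 Householder reflectors to the m x n matrix pC0 (dlarfb-style update,
+// computed in three sweeps as W = V^T*C, W := T*W, C := C - V*W). The reflectors are the
+// columns of pD (unit diagonal implicit), their taus are in dD, and pVt is a transposed copy
+// of the reflector block used for contiguous access along each reflector.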
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 || n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[0+ldw*1] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ tmp = pC[1+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ tmp = pC[2+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ tmp = pC[3+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ b1 = pW[3+ldw*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[0+jj*sdc+ps*1] = c01;
+ pC[1+jj*sdc+ps*1] = c11;
+ pC[2+jj*sdc+ps*1] = c21;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+
+ return;
+ }
+
+
+
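+// Unblocked LQ factorization of a 4 x n row panel (dgelqf style): for each of the 4 rows a
+// Householder reflector annihilating the entries to the right of the diagonal is computed
+// (beta = -sign(alpha)*||x||_2, tau = (beta-alpha)/beta, v scaled by 1.0/(alpha-beta)) and
+// immediately applied to the rows below it; the taus are returned in dD, while L and the
+// reflector vectors overwrite pD.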
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ for(ii=1; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ }
+ // second column
+ beta = 0.0;
+ for(ii=2; ii<n; ii++)
+ {
+ tmp = pD[1+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[1+ps*1] = beta;
+ for(ii=2; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ }
+ // third column
+ beta = 0.0;
+ for(ii=3; ii<n; ii++)
+ {
+ tmp = pD[2+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[2+ps*2] = beta;
+ for(ii=3; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[3+ps*2];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ w3 = - dD[2] * w3;
+ pD[3+ps*2] += w3;
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ }
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ tmp = pD[3+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+
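+// Variable-size version: computes k Householder reflectors of an m x n matrix stored in
+// panel-major format (row offset offD, panel stride sdd), processing two rows at a time with a
+// 2x2 T factor where possible and one row at a time in the tail; taus are returned in dD.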
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 || n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp;
+ double w00, w01,
+ w10, w11,
+ w20, w21,
+ w30, w31;
+ double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+ double pT[4];
+ int ldt = 2;
+ double *pD0 = pD-offD;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+ // first row
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ kmax = n-ii;
+ w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ // second row
+ pC11 = pC10+ps*1;
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC11[0+ps*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ps*jj] *= tmp;
+ }
+ // compute T
+ kmax = n-ii;
+ tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+ pT[0+ldt*0] = dD[ii+0];
+ pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldt*1] = dD[ii+1];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-2;
+ jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+ pC20 = pC20a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ pC20 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
+ w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
+ w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
+ w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
+ w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
+ w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
+ w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
+ w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
+ w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
+ w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
+ w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ w10 = - w10*pT[0+ldt*0];
+ w20 = - w20*pT[0+ldt*0];
+ w30 = - w30*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[1+ps*0] += w10*1.0 + w11*0.0;
+ pC20[2+ps*0] += w20*1.0 + w21*0.0;
+ pC20[3+ps*0] += w30*1.0 + w31*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
+ pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
+ pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
+ pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
+ pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
+ }
+ pC20 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ if(ii<n)
+ {
+ kmax = n-ii;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ pC10 = pC10a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ pC10 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ w00 = pC10[0+ps*0];
+ w10 = pC10[1+ps*0];
+ w20 = pC10[2+ps*0];
+ w30 = pC10[3+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
+ w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
+ w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
+ w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ w10 = - w10*dD[ii];
+ w20 = - w20*dD[ii];
+ w30 = - w30*dD[ii];
+ pC10[0+ps*0] += w00;
+ pC10[1+ps*0] += w10;
+ pC10[2+ps*0] += w20;
+ pC10[3+ps*0] += w30;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00*pC00[0+ps*kk];
+ pC10[1+ps*kk] += w10*pC00[0+ps*kk];
+ pC10[2+ps*kk] += w20*pC00[0+ps*kk];
+ pC10[3+ps*kk] += w30*pC00[0+ps*kk];
+ }
+ pC10 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ }
+ }
+ return;
+ }
+
+
+
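+// Build the 4x4 upper triangular factor T of a block of 4 Householder reflectors (dlarft style)
+// from the reflector vectors stored in the rows of pD (length kmax, unit diagonal implicit) and
+// the taus in dD. T is accumulated with a negated sign, so that the kernel_dlarfb4_r_* kernels
+// below can apply the block reflector as D := D + (D*V^T)*T*V.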
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+ {
+ const int ps = 4;
+ int kk;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ // 0
+ // 1
+ v10 = pD[0+ps*1];
+ // 2
+ v10 += pD[1+ps*2]*pD[0+ps*2];
+ v20 = pD[0+ps*2];
+ v21 = pD[1+ps*2];
+ // 3
+ v10 += pD[1+ps*3]*pD[0+ps*3];
+ v20 += pD[2+ps*3]*pD[0+ps*3];
+ v21 += pD[2+ps*3]*pD[1+ps*3];
+ v30 = pD[0+ps*3];
+ v31 = pD[1+ps*3];
+ v32 = pD[2+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ v10 += pD[1+ps*kk]*pD[0+ps*kk];
+ v20 += pD[2+ps*kk]*pD[0+ps*kk];
+ v30 += pD[3+ps*kk]*pD[0+ps*kk];
+ v21 += pD[2+ps*kk]*pD[1+ps*kk];
+ v31 += pD[3+ps*kk]*pD[1+ps*kk];
+ v32 += pD[3+ps*kk]*pD[2+ps*kk];
+ }
+ pT[0+ps*0] = - dD[0];
+ pT[1+ps*1] = - dD[1];
+ pT[2+ps*2] = - dD[2];
+ pT[3+ps*3] = - dD[3];
+ pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+ pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+ pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+ pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+ pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+ return;
+ }
+
+
+
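+// Fused kernel: LQ-factorize the 4 x n row panel exactly as kernel_dgelqf_4_lib4 does and, in
+// the same sweeps, accumulate the 4x4 triangular factor T of the reflector block (negated taus
+// on its diagonal) as kernel_dlarft_4_lib4 does.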
+// assume n>=4
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ return;
+ }
+
+
+
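+// Apply the block of 4 reflectors from the right to 4 rows of pD:
+// D := D + (D*V^T)*T*V, with V holding one reflector per row (unit diagonal implicit) and T the
+// negated triangular factor produced by kernel_dlarft_4_lib4 (or the fused kernel above).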
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ pW[1+ps*0] = pD[1+ps*0];
+ pW[2+ps*0] = pD[2+ps*0];
+ pW[3+ps*0] = pD[3+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[1+ps*0] += pD[1+ps*1]*pV[0+ps*1];
+ pW[2+ps*0] += pD[2+ps*1]*pV[0+ps*1];
+ pW[3+ps*0] += pD[3+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ pW[1+ps*1] = pD[1+ps*1];
+ pW[2+ps*1] = pD[2+ps*1];
+ pW[3+ps*1] = pD[3+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[1+ps*0] += pD[1+ps*2]*pV[0+ps*2];
+ pW[2+ps*0] += pD[2+ps*2]*pV[0+ps*2];
+ pW[3+ps*0] += pD[3+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[1+ps*1] += pD[1+ps*2]*pV[1+ps*2];
+ pW[2+ps*1] += pD[2+ps*2]*pV[1+ps*2];
+ pW[3+ps*1] += pD[3+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ pW[1+ps*2] = pD[1+ps*2];
+ pW[2+ps*2] = pD[2+ps*2];
+ pW[3+ps*2] = pD[3+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[1+ps*0] += pD[1+ps*3]*pV[0+ps*3];
+ pW[2+ps*0] += pD[2+ps*3]*pV[0+ps*3];
+ pW[3+ps*0] += pD[3+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[1+ps*1] += pD[1+ps*3]*pV[1+ps*3];
+ pW[2+ps*1] += pD[2+ps*3]*pV[1+ps*3];
+ pW[3+ps*1] += pD[3+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[1+ps*2] += pD[1+ps*3]*pV[2+ps*3];
+ pW[2+ps*2] += pD[2+ps*3]*pV[2+ps*3];
+ pW[3+ps*2] += pD[3+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ pW[1+ps*3] = pD[1+ps*3];
+ pW[2+ps*3] = pD[2+ps*3];
+ pW[3+ps*3] = pD[3+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[1+ps*0] += pD[1+ps*kk]*pV[0+ps*kk];
+ pW[2+ps*0] += pD[2+ps*kk]*pV[0+ps*kk];
+ pW[3+ps*0] += pD[3+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[1+ps*1] += pD[1+ps*kk]*pV[1+ps*kk];
+ pW[2+ps*1] += pD[2+ps*kk]*pV[1+ps*kk];
+ pW[3+ps*1] += pD[3+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[1+ps*2] += pD[1+ps*kk]*pV[2+ps*kk];
+ pW[2+ps*2] += pD[2+ps*kk]*pV[2+ps*kk];
+ pW[3+ps*2] += pD[3+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ pW[1+ps*3] += pD[1+ps*kk]*pV[3+ps*kk];
+ pW[2+ps*3] += pD[2+ps*kk]*pV[3+ps*kk];
+ pW[3+ps*3] += pD[3+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ pW[1+ps*3] = pW[1+ps*0]*pT[0+ps*3] + pW[1+ps*1]*pT[1+ps*3] + pW[1+ps*2]*pT[2+ps*3] + pW[1+ps*3]*pT[3+ps*3];
+ pW[2+ps*3] = pW[2+ps*0]*pT[0+ps*3] + pW[2+ps*1]*pT[1+ps*3] + pW[2+ps*2]*pT[2+ps*3] + pW[2+ps*3]*pT[3+ps*3];
+ pW[3+ps*3] = pW[3+ps*0]*pT[0+ps*3] + pW[3+ps*1]*pT[1+ps*3] + pW[3+ps*2]*pT[2+ps*3] + pW[3+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ pW[1+ps*2] = pW[1+ps*0]*pT[0+ps*2] + pW[1+ps*1]*pT[1+ps*2] + pW[1+ps*2]*pT[2+ps*2];
+ pW[2+ps*2] = pW[2+ps*0]*pT[0+ps*2] + pW[2+ps*1]*pT[1+ps*2] + pW[2+ps*2]*pT[2+ps*2];
+ pW[3+ps*2] = pW[3+ps*0]*pT[0+ps*2] + pW[3+ps*1]*pT[1+ps*2] + pW[3+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ pW[1+ps*1] = pW[1+ps*0]*pT[0+ps*1] + pW[1+ps*1]*pT[1+ps*1];
+ pW[2+ps*1] = pW[2+ps*0]*pT[0+ps*1] + pW[2+ps*1]*pT[1+ps*1];
+ pW[3+ps*1] = pW[3+ps*0]*pT[0+ps*1] + pW[3+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ pW[1+ps*0] = pW[1+ps*0]*pT[0+ps*0];
+ pW[2+ps*0] = pW[2+ps*0]*pT[0+ps*0];
+ pW[3+ps*0] = pW[3+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ pD[1+ps*0] += pW[1+ps*0];
+ pD[2+ps*0] += pW[2+ps*0];
+ pD[3+ps*0] += pW[3+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ pD[1+ps*1] += pW[1+ps*0]*pV[0+ps*1] + pW[1+ps*1];
+ pD[2+ps*1] += pW[2+ps*0]*pV[0+ps*1] + pW[2+ps*1];
+ pD[3+ps*1] += pW[3+ps*0]*pV[0+ps*1] + pW[3+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ pD[1+ps*2] += pW[1+ps*0]*pV[0+ps*2] + pW[1+ps*1]*pV[1+ps*2] + pW[1+ps*2];
+ pD[2+ps*2] += pW[2+ps*0]*pV[0+ps*2] + pW[2+ps*1]*pV[1+ps*2] + pW[2+ps*2];
+ pD[3+ps*2] += pW[3+ps*0]*pV[0+ps*2] + pW[3+ps*1]*pV[1+ps*2] + pW[3+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ pD[1+ps*3] += pW[1+ps*0]*pV[0+ps*3] + pW[1+ps*1]*pV[1+ps*3] + pW[1+ps*2]*pV[2+ps*3] + pW[1+ps*3];
+ pD[2+ps*3] += pW[2+ps*0]*pV[0+ps*3] + pW[2+ps*1]*pV[1+ps*3] + pW[2+ps*2]*pV[2+ps*3] + pW[2+ps*3];
+ pD[3+ps*3] += pW[3+ps*0]*pV[0+ps*3] + pW[3+ps*1]*pV[1+ps*3] + pW[3+ps*2]*pV[2+ps*3] + pW[3+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ pD[1+ps*kk] += pW[1+ps*0]*pV[0+ps*kk] + pW[1+ps*1]*pV[1+ps*kk] + pW[1+ps*2]*pV[2+ps*kk] + pW[1+ps*3]*pV[3+ps*kk];
+ pD[2+ps*kk] += pW[2+ps*0]*pV[0+ps*kk] + pW[2+ps*1]*pV[1+ps*kk] + pW[2+ps*2]*pV[2+ps*kk] + pW[2+ps*3]*pV[3+ps*kk];
+ pD[3+ps*kk] += pW[3+ps*0]*pV[0+ps*kk] + pW[3+ps*1]*pV[1+ps*kk] + pW[3+ps*2]*pV[2+ps*kk] + pW[3+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
+
+
+
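+// Same block-reflector update as kernel_dlarfb4_r_4_lib4, applied to a single row of pD.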
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
diff --git a/kernel/c99/kernel_dgetrf_pivot_4_lib4.c b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..787322e
--- /dev/null
+++ b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,779 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
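+// Pivot-search helper: return in p_idamax / p_amax the index and absolute value of the
+// largest-magnitude entry of a column of length n stored in panel-major format (panel height 4,
+// panel stride sda), starting at row 'offset' within its panel.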
+// C numbering, starting from 0
+void didamax_lib4(int n, int offset, double *pA, int sda, int *p_idamax, double *p_amax)
+ {
+
+ int idamax, ii;
+ double tmp, amax;
+
+ p_idamax[0] = -1;
+ if(n<1)
+ return;
+
+ const int bs = 4;
+
+ int na = (bs - offset%bs)%bs;
+ na = n<na ? n : na;
+
+ amax = -1.0;
+ ii = 0;
+ if(na>0)
+ {
+ for( ; ii<na; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ tmp = fabs(pA[1]);
+ if(tmp>amax)
+ {
+ idamax = ii+1;
+ amax = tmp;
+ }
+ tmp = fabs(pA[2]);
+ if(tmp>amax)
+ {
+ idamax = ii+2;
+ amax = tmp;
+ }
+ tmp = fabs(pA[3]);
+ if(tmp>amax)
+ {
+ idamax = ii+3;
+ amax = tmp;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<n; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+
+ p_amax[0] = amax;
+ p_idamax[0] = idamax;
+
+ return;
+
+ }
+
+
+
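+// LU factorization with partial (row) pivoting of a 4-column panel: each column is first
+// updated with the previously factored columns, then the pivot is selected with didamax_lib4,
+// rows are swapped with drowsw_lib, and the sub-diagonal entries are scaled by the inverse of
+// the pivot (stored in inv_diag_A); pivot indices are returned in ipiv.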
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ double
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+ didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ // second column
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ // third column
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+
+ // fourth column
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+
+ return;
+
+ }
+
+
+
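+// Variable-size variant of kernel_dgetrf_pivot_4_lib4: factorizes n (0<n<=4) columns of an
+// m-row panel, with explicit special cases for panels shorter than 4 rows.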
+// it processes m>0 rows and 0<n<=4 cols
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // number of rows below the top 4x4 block (only used when m>=4)
+ int ma = m-4;
+
+ double
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+
+ // find pivot & scale
+ didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ if(m>=4)
+ {
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {1,2,3}
+ {
+ if(m>1)
+ {
+ pA[1+bs*0] *= tmp0;
+ if(m>2)
+ pA[2+bs*0] *= tmp0;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+ return;
+
+ // second column
+
+ // correct
+ if(m>=4)
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*1];
+ tmp2 -= pA[2+bs*0] * u_01;
+ pA[2+bs*1] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ if(m>=4)
+ {
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ if(m>2)
+ pA[2+bs*1] *= tmp1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ if(n==2)
+ return;
+
+ // third column
+
+ // correct
+ if(m>=4)
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ if(m>2)
+ {
+ didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ if(m>=4)
+ {
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n<4)
+ return;
+
+ // fourth column
+
+ // correct
+ if(m>=4)
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ if(m>2)
+ {
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ }
+ }
+
+ if(m>3)
+ {
+ // find pivot & scale
+ didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+
+
diff --git a/kernel/c99/kernel_dsymv_4_lib4.c b/kernel/c99/kernel_dsymv_4_lib4.c
new file mode 100644
index 0000000..bed4300
--- /dev/null
+++ b/kernel/c99/kernel_dsymv_4_lib4.c
@@ -0,0 +1,1024 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
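+// Fused gemv "n" / gemv "t" sweep over one kmax-by-4 panel-major block A (bs=4, panel
+// stride sda): z_n[0..kmax) += A * (alpha_n*x_n[0..4)) is accumulated in place (y_n must
+// already be copied and scaled into z_n, see the note above), while z_t[0..4) is written
+// as alpha_t * A' * x_t + beta_t * y_t; km (<=4) gives the number of active columns.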
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha_n[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha_n[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha_n[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha_n[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ // store t
+ z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+ if(km>1)
+ {
+ z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+ if(km>2)
+ {
+ z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+ if(km>3)
+ {
+ z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
+ {
+
+ kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
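+// Adds the contribution of one 4-column block of a symmetric matrix (lower triangle stored,
+// panel-major, panel stride sda) to z = z + alpha*A*x: the "n" sweep updates z_n in place,
+// the transposed "t" sweep is accumulated and added into the first 4 entries (z_t == z_n),
+// and the unrolled prologue reads only the lower triangle of the diagonal 4x4 block.
+// offA in {0,1,2,3} is the row offset of that block inside its panel; km clips the columns.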
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ double *x_t = x_n;
+ double *z_t = z_n;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ if(offA==0)
+ {
+ if(kmax<4)
+ {
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+ goto store_t;
+ }
+ else
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ k += 4;
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ }
+ else if(offA==1)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==6)
+ goto store_t;
+
+ // 6
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==7)
+ goto store_t;
+
+ k += 7;
+
+ }
+ else if(offA==2)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==6)
+ goto store_t;
+
+ k += 6;
+
+ }
+ else // if(offA==3)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==5)
+ goto store_t;
+
+ k += 5;
+
+ }
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ store_t:
+ z_t[0] += alpha[0]*y_t_0;
+ if(km>1)
+ {
+ z_t[1] += alpha[0]*y_t_1;
+ if(km>2)
+ {
+ z_t[2] += alpha[0]*y_t_2;
+ if(km>3)
+ {
+ z_t[3] += alpha[0]*y_t_3;
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
+ {
+
+ kernel_dsymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+
diff --git a/kernel/c99/kernel_sgecp_lib4.c b/kernel/c99/kernel_sgecp_lib4.c
new file mode 100644
index 0000000..de5b704
--- /dev/null
+++ b/kernel/c99/kernel_sgecp_lib4.c
@@ -0,0 +1,1148 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
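+// Single-precision scale (sgesc) and copy (sgecp) kernels on 4-row panels: kmax is the
+// number of columns processed, one bs=4 column per iteration. In the two-index variants the
+// first number is the count of active rows and the second is the row offset of the source A
+// inside its panel, so the read may straddle two panels (A0 and A1 = A0 + bs*sda).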
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+ A[2+bs*0] *= alpha;
+ A[3+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+ A[2+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[1+bs*0];
+ B[1+bs*0] = A0[2+bs*0];
+ B[2+bs*0] = A0[3+bs*0];
+ B[3+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
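+// Lower-triangular copy kernels (strcp_l): each copies kmax+1 full-height columns (the
+// kernel does kmax += 1 internally) followed by the trailing 3x3 / 2x2 / 1x1 triangle below
+// the diagonal; the row-offset suffix follows the same convention as the sgecp kernels above.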
+// both A and B are aligned to 256-bit boundaries
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ B[2+bs*1] = A[2+bs*1];
+ B[3+bs*1] = A[3+bs*1];
+
+ B[3+bs*2] = A[3+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[1+bs*0];
+ B[1+bs*0] = A0[2+bs*0];
+ B[2+bs*0] = A0[3+bs*0];
+ B[3+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+0*bs] = A0[2+0*bs];
+ B[2+0*bs] = A0[3+0*bs];
+ B[3+0*bs] = A1[0+0*bs];
+
+ B[2+1*bs] = A0[3+1*bs];
+ B[3+1*bs] = A1[0+1*bs];
+
+ B[3+2*bs] = A1[0+2*bs];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+	// 3x3 triangle
+
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ B[2+bs*1] = A1[0+bs*1];
+ B[3+bs*1] = A1[1+bs*1];
+
+ B[3+bs*2] = A1[1+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ B[2+bs*1] = A1[1+bs*1];
+ B[3+bs*1] = A1[2+bs*1];
+
+ B[3+bs*2] = A1[2+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ B[2+bs*1] = A[2+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ B[2+bs*1] = A1[0+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ B[2+bs*1] = A1[1+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 1x1 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 1x1 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and C are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
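+// General matrix add kernels (sgead): B(i,j) += alpha * A(i,j), column by column on 4-row
+// panels, with the same active-rows / source-row-offset suffix convention as sgecp.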
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgemm_4x4_lib4.c b/kernel/c99/kernel_sgemm_4x4_lib4.c
new file mode 100644
index 0000000..243d559
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_4x4_lib4.c
@@ -0,0 +1,6094 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
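+// 4x4 block of D = alpha*A*B' + beta*C, with A and B stored as 4-row panels (nt: B accessed
+// transposed). offsetC and offsetD are the row offsets of the C and D blocks inside their
+// panels (sdc/sdd are the panel strides, so a block may straddle two panels); rows [m0,m1)
+// and columns [n0,n1) of the 4x4 block are the ones actually stored.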
+void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ float
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+ // shift sol for cols
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
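+// Same nt product as the gen kernel above, but with C and D aligned to their panels
+// (offset 0): km and kn give the number of rows and columns of the 4x4 block written to D.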
+void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
+void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
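+// nn variant: B is not transposed but read column-wise from a panel-major matrix with panel
+// stride sdb (element (i,j) of a panel sits at B[i+bs*j]), so each k step loads one row of
+// the current 4x4 sub-block of B; km and kn clip the rows and columns written to D.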
+void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
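+// syrk variant of the nt kernel: only the lower triangle of D = alpha*A*B' + beta*C is
+// computed and stored; the upper-triangle accumulators are left commented out in the body.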
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+// c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+// c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+// c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+// c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+// c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+// c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
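+// strmm kernel ('nt' layout, triangular B): D <= beta*C + alpha*A*B^T, where the triangular structure of B lets the first three k iterations update only the leading columns of the accumulator; km/kn bound the stored rows/columns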
+void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ k = 0;
+
+ // k = 0
+ if(kmax>0)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 1
+	if(kmax>1)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 2
+	if(kmax>2)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
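+// generalized strmm kernel ('nn' layout, triangular B): D <= alpha*A*B; offsetB and offsetD handle panel-unaligned starts, sdb/sdd are the panel-major leading dimensions, and m0/m1, n0/n1 select the row/column range actually written (split across D0/D1 when offsetD>0)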
+void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ float *D1;
+
+ int k;
+
+ B += offsetB;
+
+ k = 0;
+
+ if(offsetB==0)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==1)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 5
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+	else // if(offsetB==3)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ store:
+
+ c_00 = alpha[0]*c_00;
+ c_10 = alpha[0]*c_10;
+ c_20 = alpha[0]*c_20;
+ c_30 = alpha[0]*c_30;
+
+ c_01 = alpha[0]*c_01;
+ c_11 = alpha[0]*c_11;
+ c_21 = alpha[0]*c_21;
+ c_31 = alpha[0]*c_31;
+
+ c_02 = alpha[0]*c_02;
+ c_12 = alpha[0]*c_12;
+ c_22 = alpha[0]*c_22;
+ c_32 = alpha[0]*c_32;
+
+ c_03 = alpha[0]*c_03;
+ c_13 = alpha[0]*c_13;
+ c_23 = alpha[0]*c_23;
+ c_33 = alpha[0]*c_33;
+
+	// shift the solution left by n0 columns (store only columns n0..n1-1)
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
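+// Cholesky kernel: D <= chol(C - A*B^T) on a 4x4 diagonal block (lower factor); the reciprocal of each diagonal entry is written to inv_diag_D, and a non-positive pivot is clamped to zero rather than signalling an error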
+void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
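+// fused kernel: syrk update over kp terms (D <= C + Ap*Bp^T) followed in place by the Cholesky factorization above, subtracting Am*Bm^T over km_ terms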
+void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_ssyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
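+// trsm kernel: D solves D*E^T = C - A*B^T with E lower triangular, using the precomputed reciprocal diagonal in inv_diag_E; km/kn bound the stored rows/columns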
+void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
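+// fused kernel: gemm accumulation over kp terms (D <= C + Ap*Bp^T) followed in place by the rl_inv triangular solve above, subtracting Am*Bm^T over km_ terms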
+void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_sgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_sgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
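+// trsm kernel, unit-diagonal variant: D solves D*E^T = C - A*B^T with E unit lower triangular, so no diagonal scaling is needed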
+void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
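+// trsm kernel: D solves D*E^T = C - A*B^T with E upper triangular, processed from the last column backwards using the reciprocal diagonal in inv_diag_E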
+void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+
+ if(kn>3)
+ {
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+ tmp = E[2+bs*3];
+ c_02 -= c_03 * tmp;
+ c_12 -= c_13 * tmp;
+ c_22 -= c_23 * tmp;
+ c_32 -= c_33 * tmp;
+ tmp = E[1+bs*3];
+ c_01 -= c_03 * tmp;
+ c_11 -= c_13 * tmp;
+ c_21 -= c_23 * tmp;
+ c_31 -= c_33 * tmp;
+ tmp = E[0+bs*3];
+ c_00 -= c_03 * tmp;
+ c_10 -= c_13 * tmp;
+ c_20 -= c_23 * tmp;
+ c_30 -= c_33 * tmp;
+ }
+
+ if(kn>2)
+ {
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+ tmp = E[1+bs*2];
+ c_01 -= c_02 * tmp;
+ c_11 -= c_12 * tmp;
+ c_21 -= c_22 * tmp;
+ c_31 -= c_32 * tmp;
+ tmp = E[0+bs*2];
+ c_00 -= c_02 * tmp;
+ c_10 -= c_12 * tmp;
+ c_20 -= c_22 * tmp;
+ c_30 -= c_32 * tmp;
+ }
+
+ if(kn>1)
+ {
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+ tmp = E[0+bs*1];
+ c_00 -= c_01 * tmp;
+ c_10 -= c_11 * tmp;
+ c_20 -= c_21 * tmp;
+ c_30 -= c_31 * tmp;
+ }
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
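+// LU kernel (no pivoting): D <= lu(C - A*B) on a 4x4 block, with the unit lower factor stored strictly below the diagonal and the upper factor on and above it; reciprocals of the diagonal entries go to inv_diag_D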
+void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // factorization
+
+ // first column
+ tmp = 1.0 / c_00;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ // second column
+ c_11 -= c_10 * c_01;
+ c_21 -= c_20 * c_01;
+ c_31 -= c_30 * c_01;
+
+ tmp = 1.0 / c_11;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ // third column
+ c_12 -= c_10 * c_02;
+ c_22 -= c_20 * c_02;
+ c_32 -= c_30 * c_02;
+
+ c_22 -= c_21 * c_12;
+ c_32 -= c_31 * c_12;
+
+ tmp = 1.0 / c_22;
+ c_32 *= tmp;
+
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ // fourth column
+ c_13 -= c_10 * c_03;
+ c_23 -= c_20 * c_03;
+ c_33 -= c_30 * c_03;
+
+ c_23 -= c_21 * c_13;
+ c_33 -= c_31 * c_13;
+
+ c_33 -= c_32 * c_23;
+
+ tmp = 1.0 / c_33;
+
+ inv_diag_D[3] = tmp;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
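+// triangular solve D = E^{-1} * (C - A*B), with E lower triangular with unit diagonal; km and kn select the part of D actually stored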
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_1, e_2, e_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solution
+
+ if(km==1)
+ goto store;
+
+ e_1 = E[1+bs*0];
+ e_2 = E[2+bs*0];
+ e_3 = E[3+bs*0];
+ c_10 -= e_1 * c_00;
+ c_20 -= e_2 * c_00;
+ c_30 -= e_3 * c_00;
+ c_11 -= e_1 * c_01;
+ c_21 -= e_2 * c_01;
+ c_31 -= e_3 * c_01;
+ c_12 -= e_1 * c_02;
+ c_22 -= e_2 * c_02;
+ c_32 -= e_3 * c_02;
+ c_13 -= e_1 * c_03;
+ c_23 -= e_2 * c_03;
+ c_33 -= e_3 * c_03;
+
+ if(km==2)
+ goto store;
+
+ e_2 = E[2+bs*1];
+ e_3 = E[3+bs*1];
+ c_20 -= e_2 * c_10;
+ c_30 -= e_3 * c_10;
+ c_21 -= e_2 * c_11;
+ c_31 -= e_3 * c_11;
+ c_22 -= e_2 * c_12;
+ c_32 -= e_3 * c_12;
+ c_23 -= e_2 * c_13;
+ c_33 -= e_3 * c_13;
+
+ if(km==3)
+ goto store;
+
+ e_3 = E[3+bs*2];
+ c_30 -= e_3 * c_20;
+ c_31 -= e_3 * c_21;
+ c_32 -= e_3 * c_22;
+ c_33 -= e_3 * c_23;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+ }
+#endif
+
+
+
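+// triangular solve D = (C - A*B) * E^{-1}, with E upper triangular; inv_diag_E holds the reciprocals of the diagonal of E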
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_10 *= e_00;
+ c_20 *= e_00;
+ c_30 *= e_00;
+
+ if(kn==1)
+ goto store;
+
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_01 -= c_00 * e_01;
+ c_11 -= c_10 * e_01;
+ c_21 -= c_20 * e_01;
+ c_31 -= c_30 * e_01;
+ c_01 *= e_11;
+ c_11 *= e_11;
+ c_21 *= e_11;
+ c_31 *= e_11;
+
+ if(kn==2)
+ goto store;
+
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_02 -= c_00 * e_02;
+ c_12 -= c_10 * e_02;
+ c_22 -= c_20 * e_02;
+ c_32 -= c_30 * e_02;
+ c_02 -= c_01 * e_12;
+ c_12 -= c_11 * e_12;
+ c_22 -= c_21 * e_12;
+ c_32 -= c_31 * e_12;
+ c_02 *= e_22;
+ c_12 *= e_22;
+ c_22 *= e_22;
+ c_32 *= e_22;
+
+ if(kn==3)
+ goto store;
+
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_03 -= c_00 * e_03;
+ c_13 -= c_10 * e_03;
+ c_23 -= c_20 * e_03;
+ c_33 -= c_30 * e_03;
+ c_03 -= c_01 * e_13;
+ c_13 -= c_11 * e_13;
+ c_23 -= c_21 * e_13;
+ c_33 -= c_31 * e_13;
+ c_03 -= c_02 * e_23;
+ c_13 -= c_12 * e_23;
+ c_23 -= c_22 * e_23;
+ c_33 -= c_32 * e_23;
+ c_03 *= e_33;
+ c_13 *= e_33;
+ c_23 *= e_33;
+ c_33 *= e_33;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
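+// triangular solve D = E^{-1} * (C - A*B), with E upper triangular (backward substitution); inv_diag_E holds the reciprocals of the diagonal of E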
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ if(km>3)
+ {
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_30 *= e_33;
+ c_31 *= e_33;
+ c_32 *= e_33;
+ c_33 *= e_33;
+ c_00 -= e_03 * c_30;
+ c_01 -= e_03 * c_31;
+ c_02 -= e_03 * c_32;
+ c_03 -= e_03 * c_33;
+ c_10 -= e_13 * c_30;
+ c_11 -= e_13 * c_31;
+ c_12 -= e_13 * c_32;
+ c_13 -= e_13 * c_33;
+ c_20 -= e_23 * c_30;
+ c_21 -= e_23 * c_31;
+ c_22 -= e_23 * c_32;
+ c_23 -= e_23 * c_33;
+ }
+
+ if(km>2)
+ {
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_20 *= e_22;
+ c_21 *= e_22;
+ c_22 *= e_22;
+ c_23 *= e_22;
+ c_00 -= e_02 * c_20;
+ c_01 -= e_02 * c_21;
+ c_02 -= e_02 * c_22;
+ c_03 -= e_02 * c_23;
+ c_10 -= e_12 * c_20;
+ c_11 -= e_12 * c_21;
+ c_12 -= e_12 * c_22;
+ c_13 -= e_12 * c_23;
+ }
+
+ if(km>1)
+ {
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_10 *= e_11;
+ c_11 *= e_11;
+ c_12 *= e_11;
+ c_13 *= e_11;
+ c_00 -= e_01 * c_10;
+ c_01 -= e_01 * c_11;
+ c_02 -= e_01 * c_12;
+ c_03 -= e_01 * c_13;
+ }
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_01 *= e_00;
+ c_02 *= e_00;
+ c_03 *= e_00;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
diff --git a/kernel/c99/kernel_sgemm_diag_lib4.c b/kernel/c99/kernel_sgemm_diag_lib4.c
new file mode 100644
index 0000000..93df707
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_diag_lib4.c
@@ -0,0 +1,1112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_0;
+ c_2 = a_2 * b_0;
+ c_3 = a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = a_0 * b_1;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_1;
+ c_3 = a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = a_0 * b_2;
+ c_1 = a_1 * b_2;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = a_0 * b_3;
+ c_1 = a_1 * b_3;
+ c_2 = a_2 * b_3;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
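+// matrix-vector multiply z = beta*y + alpha*A*x, 4 rows of A at a time; only the entries with index in [k0,k1) are written to z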
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
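+// matrix-vector multiply z = beta*y + alpha*A'*x, 4 columns of A at a time; offA is the row offset within the first panel, km the number of entries written to z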
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
+ {
+
+ const int bs = 4;
+
+ int k, kend;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ if(offA!=0) // 1, 2, 3
+ {
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1);
+ }
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
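+// triangular solve with A lower triangular (forward substitution): the leading kmax columns of A times x are subtracted from y, then the 4x4 diagonal block is solved using inv_diag_A; km and kn give the variable size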
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ float
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+ }
+#endif
+
+
+
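+// triangular solve with A lower triangular, transposed (backward substitution): the rows of A below the 4x4 diagonal block are applied to x first, then the block is solved using inv_diag_A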
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
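+// as above, for 3x3, 2x2 and 1x1 diagonal blocks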
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
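+// lt solve for a single unknown: z[0] = inv_diag_A[0] * (y[0] - sum over the rows below of A(i,0)*x(i))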
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
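+// upper non-transposed (un) triangular matrix-vector product: z[0:3] = A*x over kmax columns, the leading 4x4 block of A being upper triangular (the commented-out products correspond to its zero entries)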
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
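+// upper transposed (ut) triangular matrix-vector product: z = A'*x over a kmax-row panel whose last 4x4 block is upper triangular; only the first km entries of z are stored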
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+ // store_vs
+ store:
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
+ {
+
+ kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgetrf_pivot_4_lib4.c b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..fdec8de
--- /dev/null
+++ b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
@@ -0,0 +1,786 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_s_aux.h"
+
+
+
+// C numbering, starting from 0
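+// it returns in p_idamax the index (relative to the first row considered) of the entry with the largest absolute value in a panel-major column of length n, and in p_amax that absolute value; p_idamax[0] is set to -1 when n<1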
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
+ {
+
+ int idamax, ii;
+ float tmp, amax;
+
+ p_idamax[0] = -1;
+ if(n<1)
+ return;
+
+ const int bs = 4;
+
+ int na = (bs - offset%bs)%bs;
+ na = n<na ? n : na;
+
+ amax = -1.0;
+ ii = 0;
+ if(na>0)
+ {
+ for( ; ii<na; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ tmp = fabs(pA[1]);
+ if(tmp>amax)
+ {
+ idamax = ii+1;
+ amax = tmp;
+ }
+ tmp = fabs(pA[2]);
+ if(tmp>amax)
+ {
+ idamax = ii+2;
+ amax = tmp;
+ }
+ tmp = fabs(pA[3]);
+ if(tmp>amax)
+ {
+ idamax = ii+3;
+ amax = tmp;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<n; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+
+ p_amax[0] = amax;
+ p_idamax[0] = idamax;
+
+ return;
+
+ }
+#endif
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
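+// unblocked LU factorization with row pivoting of an m x 4 panel-major block: ipiv gets the pivot row indices, inv_diag_A the reciprocals of the pivots (0.0 when a pivot is exactly zero)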
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ float
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ float
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+ sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ // second column
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ // third column
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+
+ // fourth column
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
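+// variable-size variant: it factorizes only the first n columns and guards every update so that panels with m<4 are handled as well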
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+	// m-4 may be negative here; the loops over ma are guarded by m>=4 checks
+ int ma = m-4;
+
+ float
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ float
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+
+ // find pivot & scale
+ sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ if(m>=4)
+ {
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {1,2,3}
+ {
+ if(m>1)
+ {
+ pA[1+bs*0] *= tmp0;
+ if(m>2)
+ pA[2+bs*0] *= tmp0;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+	if(n==1 || m==1) // XXX nothing else to factorize when a single column or a single row is left, so we can return here
+ return;
+
+ // second column
+
+ // correct
+ if(m>=4)
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*1];
+ tmp2 -= pA[2+bs*0] * u_01;
+ pA[2+bs*1] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ if(m>=4)
+ {
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ if(m>2)
+ pA[2+bs*1] *= tmp1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ if(n==2)
+ return;
+
+ // third column
+
+ // correct
+ if(m>=4)
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ if(m>2)
+ {
+ sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ if(m>=4)
+ {
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n<4)
+ return;
+
+ // fourth column
+
+ // correct
+ if(m>=4)
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ if(m>2)
+ {
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ }
+ }
+
+ if(m>3)
+ {
+ // find pivot & scale
+ sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
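+// fused nt kernel on a 4-wide panel: z_n[i] += A(i,0:3)*(alpha_n*x_n) for every row i, while z_t[j] = alpha_t*A(:,j)'*x_t + beta_t*y_t[j] is stored for the first km columns; entries of x_n beyond km are treated as zero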
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha_n[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha_n[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha_n[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha_n[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ // store t
+ z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+ if(km>1)
+ {
+ z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+ if(km>2)
+ {
+ z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+ if(km>3)
+ {
+ z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
+ {
+
+ kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
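+// symmetric (lower stored) kernel: the 4x4 diagonal block starts at row offset offA inside its panel and only its lower triangle is read; x_t and z_t alias x_n and z_n, and alpha*A'*x_t is added to the first km entries of z_t at the end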
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ float *x_t = x_n;
+ float *z_t = z_n;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ if(offA==0)
+ {
+ if(kmax<4)
+ {
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+ goto store_t;
+ }
+ else
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ k += 4;
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ }
+ else if(offA==1)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==6)
+ goto store_t;
+
+ // 6
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==7)
+ goto store_t;
+
+ k += 7;
+
+ }
+ else if(offA==2)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==6)
+ goto store_t;
+
+ k += 6;
+
+ }
+ else // if(offA==3)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==5)
+ goto store_t;
+
+ k += 5;
+
+ }
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ store_t:
+ z_t[0] += alpha[0]*y_t_0;
+ if(km>1)
+ {
+ z_t[1] += alpha[0]*y_t_1;
+ if(km>2)
+ {
+ z_t[2] += alpha[0]*y_t_2;
+ if(km>3)
+ {
+ z_t[3] += alpha[0]*y_t_3;
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
+ {
+
+ kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+
+
diff --git a/kernel/fma/Makefile b/kernel/fma/Makefile
new file mode 100644
index 0000000..d7be280
--- /dev/null
+++ b/kernel/fma/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/fma/kernel_dgemm_4x4_lib4.S b/kernel/fma/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..a02f37d
--- /dev/null
+++ b/kernel/fma/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3895 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
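+// it accumulates A*B' over k columns into the 4x4 block held in xmm0..xmm7 (two 128-bit registers per column of D), with the main loop unrolled by 4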
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ subl $4, %r10d
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r11
+ addq $128, %r12
+
+
+ cmpl $4, %r10d
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ subl $1, %r10d
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
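+// same loop structure as the add variant above, but it subtracts A*B' from the accumulators (vfnmadd)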
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ subl $4, %r10d
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ cmpl $4, %r10d
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ subl $1, %r10d
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $32, %r12
+ addq $32, %r11
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
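+// (iteration k of the edge updates only the accumulators of columns 0..k, since the remaining entries of B are zero)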
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
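+// A reference sketch of this edge (assuming lib4 panel storage with bs=4 and
+// element (j,l) of B stored at B[j+4*l]; B is upper triangular, so in
+// D += A * B^T only the entries with j<=l contribute):
+//
+//   for(l=0; l<4; l++)
+//     for(j=0; j<=l; j++)
+//       for(i=0; i<4; i++)
+//         d[i][j] += A[i+4*l] * B[j+4*l];
+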
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %xmm8
+ vmovapd 16(%r10), %xmm9
+ vmovddup 0(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovapd 32(%r10), %xmm8
+ vmovapd 48(%r10), %xmm9
+ vmovddup 32(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 40(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovapd 64(%r10), %xmm8
+ vmovapd 80(%r10), %xmm9
+ vmovddup 64(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 72(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 80(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovapd 96(%r10), %xmm8
+ vmovapd 112(%r10), %xmm9
+ vmovddup 96(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 104(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 112(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ vmovddup 120(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
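+// Variable-size variant of the edge above: the counter in r10d is checked after
+// every column, so only min(k,4) columns of the triangle are accumulated
+// (a sketch of the intent):
+//
+//   for(l=0; l<4 && l<k; l++)
+//     for(j=0; j<=l; j++)
+//       for(i=0; i<4; i++)
+//         d[i][j] += A[i+4*l] * B[j+4*l];
+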
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ addq $32, %r11
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ addq $32, %r11
+ vmovddup 16(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 16(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ addq $32, %r11
+ vmovddup 24(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+ // XXX nothing to blend
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
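+// Update applied to the accumulators d[i][j] held in xmm0..xmm7 (a sketch;
+// alpha and beta are the scalars pointed to by r10 and r11, C is the bs=4
+// panel pointed to by r12):
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       d[i][j] = alpha[0]*d[i][j] + beta[0]*C[i+4*j];
+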
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // XXX nothing to blend
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+	mulpd	%xmm15, %xmm4
+	mulpd	%xmm15, %xmm5
+	mulpd	%xmm15, %xmm6
+	mulpd	%xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+
+ vmovapd 0(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm0
+ vmovapd 16(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm1
+ vmovapd 32(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm2
+ vmovapd 48(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm3
+ vmovapd 64(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm4
+ vmovapd 80(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm5
+ vmovapd 96(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm6
+ vmovapd 112(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
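+// In this 128-bit layout the accumulators are not interleaved, so there is
+// nothing to un-blend: the routine reduces to the same update as
+// inner_scale_ab_4x4_lib4 above, d[i][j] = alpha[0]*d[i][j] + beta[0]*C[i+4*j].
+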
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+	mulpd	%xmm15, %xmm4
+	mulpd	%xmm15, %xmm5
+	mulpd	%xmm15, %xmm6
+	mulpd	%xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+
+ vmovapd 0(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm0
+ vmovapd 16(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm1
+ vmovapd 32(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm2
+ vmovapd 48(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm3
+ vmovapd 64(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm4
+ vmovapd 80(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm5
+ vmovapd 96(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm6
+ vmovapd 112(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
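+// alpha = beta = 1.0 case: the accumulators are simply summed with C (sketch):
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       d[i][j] += C[i+4*j];
+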
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %xmm15
+ vaddpd %xmm0, %xmm15, %xmm0
+ vmovapd 16(%r10), %xmm15
+ vaddpd %xmm1, %xmm15, %xmm1
+ vmovapd 32(%r10), %xmm15
+ vaddpd %xmm2, %xmm15, %xmm2
+ vmovapd 48(%r10), %xmm15
+ vaddpd %xmm3, %xmm15, %xmm3
+ vmovapd 64(%r10), %xmm15
+ vaddpd %xmm4, %xmm15, %xmm4
+ vmovapd 80(%r10), %xmm15
+ vaddpd %xmm5, %xmm15, %xmm5
+ vmovapd 96(%r10), %xmm15
+ vaddpd %xmm6, %xmm15, %xmm6
+ vmovapd 112(%r10), %xmm15
+ vaddpd %xmm7, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
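+// Reference sketch of the factorization (assuming d holds the symmetric update
+// C - A*B^T accumulated by the callers, that only the lower triangle is used,
+// and n = min(kn,4); r10 points to the output array of reciprocal diagonals):
+//
+//   for(j=0; j<n; j++) {
+//     for(k=0; k<j; k++)                            // left-looking update
+//       for(i=j; i<4; i++) d[i][j] -= d[i][k]*d[j][k];
+//     tmp = d[j][j]>0.0 ? 1.0/sqrt(d[j][j]) : 0.0;  // 0.0 if not positive
+//     inv_diag[j] = tmp;
+//     for(i=j; i<4; i++) d[i][j] *= tmp;
+//   }
+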
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %xmm15, %xmm15, %xmm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ cmpl $2, %r11d
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm0, %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vpermilpd $0x3, %xmm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ cmpl $3, %r11d
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ jl 0f // ret
+
+ vpermilpd $0x0, %xmm1, %xmm13
+// vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vpermilpd $0x0, %xmm3, %xmm13
+// vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovaps %xmm5, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ cmpl $4, %r11d
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+// vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm1, %xmm13
+// vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm3, %xmm13
+// vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm5, %xmm13
+// vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm7, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+// vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+ jmp 0f
+
+1:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 2b
+
+3:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 4b
+
+5:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 6b
+
+7:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
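+// Reference sketch of the right solve d <- d * E^-T (assuming E is a bs=4 panel
+// holding the lower triangular factor, element (i,j) at E[i+4*j], and
+// inv_diag_E[j] = 1.0/E[j][j]):
+//
+//   for(j=0; j<4; j++) {
+//     for(k=0; k<j; k++)
+//       for(i=0; i<4; i++) d[i][j] -= d[i][k] * E[j+4*k];
+//     for(i=0; i<4; i++) d[i][j] *= inv_diag_E[j];
+//   }
+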
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovddup 0(%r11), %xmm13
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ vmovddup 8(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vmovddup 8(%r11), %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ vmovddup 16(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vmovddup 48(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovddup 16(%r11), %xmm13
+ vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ vmovddup 24(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vmovddup 56(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vmovddup 88(%r10), %xmm13
+ vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vmovddup 24(%r11), %xmm13
+ vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
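+// Variable-size variant of the solve above: the column loop stops early, so the
+// reference loop runs for j < min(kn,4) only (kn in r12d).
+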
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovddup 0(%r11), %xmm13
+ cmpl $2, %r12d
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ jl 0f // ret
+
+ vmovddup 8(%r10), %xmm13
+ cmpl $3, %r12d
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vmovddup 8(%r11), %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ jl 0f // ret
+
+ vmovddup 16(%r10), %xmm13
+ cmpl $4, %r12d
+ vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vmovddup 48(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovddup 16(%r11), %xmm13
+ vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ jl 0f // ret
+
+ vmovddup 24(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vmovddup 56(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vmovddup 88(%r10), %xmm13
+ vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vmovddup 24(%r11), %xmm13
+ vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
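+// The 4x4 tile is written back in the lib4 panel layout, element (i,j) of D at
+// D[i+4*j], i.e. column j occupies bytes 32*j..32*j+31 (sketch):
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       D[i+4*j] = d[i][j];
+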
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ vmovapd %xmm6, 96(%r10)
+ vmovapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// TODO use blendv instead
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
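+// Variable-size store: only the first km rows (r11d) and kn columns (r12d) of
+// the tile are written, the rest of the D panel is left untouched (sketch):
+//
+//   for(j=0; j<kn && j<4; j++)
+//     for(i=0; i<km && i<4; i++)
+//       D[i+4*j] = d[i][j];
+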
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ cmpl $2, %r12d
+ vmovsd %xmm0, 0(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovsd %xmm2, 32(%r10)
+ jl 4f // end
+ vmovsd %xmm4, 64(%r10)
+ je 4f // end
+ vmovsd %xmm6, 96(%r10)
+
+ jmp 4f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+
+ jmp 4f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovsd %xmm1, 16(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ vmovsd %xmm3, 48(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ vmovsd %xmm5, 80(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+ vmovsd %xmm7, 112(%r10)
+
+ jmp 4f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+ vmovapd %xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
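+// Lower triangular store: only elements on or below the diagonal are written;
+// the strictly upper entries of the D panel are reloaded and merged back so
+// they keep their previous values (sketch):
+//
+//   for(j=0; j<4; j++)
+//     for(i=j; i<4; i++)
+//       D[i+4*j] = d[i][j];
+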
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+// vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+// vmovapd %xmm6, 96(%r10)
+ vmovsd 112(%r10), %xmm15
+ vmovsd %xmm15, %xmm7, %xmm7
+ vmovapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ vmovsd %xmm0, 0(%r10)
+
+ jmp 3f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ jl 3f // end
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+
+ jmp 3f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovsd %xmm1, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovsd %xmm3, 48(%r10)
+ jl 3f // end
+// vmovapd %xmm4, 64(%r10)
+ vmovsd %xmm5, 80(%r10)
+
+ jmp 3f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ jl 3f // end
+// vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ je 3f // end
+// vmovapd %xmm6, 96(%r10)
+ vmovsd 112(%r10), %xmm15
+ vmovsd %xmm15, %xmm7, %xmm7
+ vmovapd %xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
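+// Reference form of the computation (a sketch, assuming all matrices in the
+// lib4 panel format with bs=4):
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++) {
+//       tmp = 0.0;
+//       for(l=0; l<k; l++) tmp += A[i+4*l] * B[j+4*l];
+//       D[i+4*j] = alpha[0]*tmp + beta[0]*C[i+4*j];
+//     }
+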
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
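+// Fused syrk + Cholesky: the kernel accumulates Ap*Bp^T over kp columns,
+// subtracts Am*Bm^T over km columns, adds C, factors the 4x4 block (lower
+// Cholesky, reciprocal diagonal written to inv_diag_D) and stores the lower
+// triangle into D. Accumulation step in reference form (a sketch):
+//
+//   d[i][j] = C[i+4*j]
+//           + sum_{l<kp} Ap[i+4*l]*Bp[j+4*l]
+//           - sum_{l<km} Am[i+4*l]*Bm[j+4*l];
+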
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
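+// Fused gemm + trsm: the accumulator is built as
+//   d[i][j] = C[i+4*j] + sum_{l<kp} Ap[i+4*l]*Bp[j+4*l] - sum_{l<km} Am[i+4*l]*Bm[j+4*l]
+// and then the right solve d <- d * E^-T is applied before the full 4x4 tile is
+// stored into D (a sketch of the intent; see the call sequence below).
+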
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
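+//
+// flow: the accumulators are built as Ap*Bp' (add-nt kernel, kp iterations) minus
+// Am*Bm' (sub-nt kernel, ARG4 iterations), C is added, the right-lower-transposed
+// triangular solve with E and inv_diag_E is applied, and at most km rows and kn
+// columns of the 4x4 result are stored in D.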
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
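+
+// note: each constant above is stored as pairs of .long values, i.e. the
+// little-endian low and high 32-bit words of the IEEE-754 doubles listed in the
+// brace comments (e.g. 0 / 1072693248 = 0x3FF00000 encodes 1.0).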
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/sse3/Makefile b/kernel/sse3/Makefile
new file mode 100644
index 0000000..dbc07d1
--- /dev/null
+++ b/kernel/sse3/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
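+
+# Note: the kernels in this directory are built only for TARGET=X64_INTEL_CORE with
+# LA=HIGH_PERFORMANCE; with LA=REFERENCE or LA=BLAS no objects are produced here.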
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/sse3/kernel_dgemm_4x4_lib4.S b/kernel/sse3/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..26f35b6
--- /dev/null
+++ b/kernel/sse3/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,6235 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
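+
+// note on the macros above: on the System V AMD64 ABI (Linux, macOS) the first six
+// integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the rest on
+// the stack, so ARG7 and up are read at STACKSIZE+offset(%rsp) after the PROLOGUE
+// has lowered the stack pointer. On Windows x64 only rcx, rdx, r8, r9 carry
+// arguments and the caller reserves a 32-byte shadow space, which is why ARG5
+// starts at STACKSIZE+40(%rsp); Windows also treats rdi, rsi and xmm6-xmm15 as
+// callee-saved, hence the extra spills in the Windows PROLOGUE/EPILOGUE.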
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
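+//
+// note: the main loop below is unrolled 4 times in k and software-pipelined: each
+// unroll issues the multiplies for the current B pair while folding the products
+// of the previous one into xmm0-xmm7, so the "clean accumulators" blocks after the
+// loops add the last pending products.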
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+ movapd 0(%r12), %xmm10 // B[0]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ addpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ addpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ addpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ addpd %xmm10, %xmm1
+ movapd 0(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ cmpl $4, %r10d
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ addpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ addpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ addpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ addpd %xmm10, %xmm1
+// movapd 0(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+// cmpl $4, %r10d
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm3
+ addpd %xmm11, %xmm7
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $32, %r12
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[2]
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm3
+ addpd %xmm11, %xmm7
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+ movapd 0(%r12), %xmm10 // B[0]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ subpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ subpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ subpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ subpd %xmm10, %xmm1
+ movapd 0(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ cmpl $4, %r10d
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ subpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ subpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ subpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ subpd %xmm10, %xmm1
+// movapd 0(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+// cmpl $4, %r10d
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+
+
+ // update accumulators
+ subpd %xmm14, %xmm3
+ subpd %xmm11, %xmm7
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $32, %r12
+
+ subpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[2]
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+ // update accumulators
+ subpd %xmm14, %xmm3
+ subpd %xmm11, %xmm7
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
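+//
+// note: in this nn variant single B elements are broadcast with movddup and B
+// advances by the panel stride r13 (= 4*sdb*sizeof(double)) once per 4 iterations;
+// the prefetcht0 pair touches the B panel two strides ahead.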
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ movddup 8(%r12), %xmm10 // B[1]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 40(%r12), %xmm15 // B[5]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 72(%r12), %xmm14 // B[9]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 104(%r12), %xmm12 // B[13]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ movddup 16(%r12), %xmm10 // B[2]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ movddup 48(%r12), %xmm15 // B[6]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 80(%r12), %xmm14 // B[10]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 112(%r12), %xmm12 // B[14]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ movddup 24(%r12), %xmm10 // B[3]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 56(%r12), %xmm15 // B[7]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ movddup 88(%r12), %xmm14 // B[11]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 120(%r12), %xmm12 // B[15]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+ addq %r13, %r12 // B += ...
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ movddup 8(%r12), %xmm10 // B[1]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 40(%r12), %xmm15 // B[5]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 72(%r12), %xmm14 // B[9]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 104(%r12), %xmm12 // B[13]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ movddup 16(%r12), %xmm10 // B[2]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ movddup 48(%r12), %xmm15 // B[6]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 80(%r12), %xmm14 // B[10]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 112(%r12), %xmm12 // B[14]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ movddup 24(%r12), %xmm10 // B[3]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 56(%r12), %xmm15 // B[7]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ movddup 88(%r12), %xmm14 // B[11]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 120(%r12), %xmm12 // B[15]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+ addq %r13, %r12 // B += ...
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addq $8, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm12 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm12 <- dirty
+// xmm15 <- dirty
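+//
+// note: this edge handles a B panel starting at row offset offB: it runs
+// kend = min(k, 4-offB) scalar-broadcast iterations, then moves B to the start of
+// the next panel so the aligned nn kernel can take over.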
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 64(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 96(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm12 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm12 <- dirty
+// xmm15 <- dirty
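+//
+// note: B is lower triangular, so each offB branch below multiplies only by the B
+// entries inside the triangle and then leaves r10, r11 and r12 positioned for the
+// regular nn kernel.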
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ // unroll 2
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 64(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 72(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 104(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 4
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 112(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 5
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ // unroll 1
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 72(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 112(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 4
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
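+//
+// note: B is upper triangular, so the j-th of the four unrolled steps below uses
+// only the first j+1 entries of the j-th B panel column; afterwards A and B have
+// both advanced by one full 4x4 block (128 bytes).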
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ movapd 0(%r10), %xmm8
+ movapd 16(%r10), %xmm9
+ movddup 0(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+
+ movapd 32(%r10), %xmm8
+ movapd 48(%r10), %xmm9
+ movddup 32(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 40(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+
+ movapd 64(%r10), %xmm8
+ movapd 80(%r10), %xmm9
+ movddup 64(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 72(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 80(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+ movapd 96(%r10), %xmm8
+ movapd 112(%r10), %xmm9
+ movddup 96(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 104(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 112(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movddup 120(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
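+//
+// note: the vs variant decrements k after each of the four triangular steps and
+// returns as soon as it reaches zero, so it also handles k < 4 correctly.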
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ addq $32, %r11
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ addq $32, %r11
+ movddup 16(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 16(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ addq $32, %r11
+ movddup 24(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
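+// Reference only, not assembled: the swap below undoes the interleaved
+// accumulation of the inner NT kernel by exchanging the low doubles of the
+// register pairs (0,1), (2,3), (4,5), (6,7). A minimal C intrinsics sketch of
+// one such exchange (helper name is illustrative):
+//
+//	#include <emmintrin.h>
+//	static inline void blend_pair(__m128d *a, __m128d *b)
+//		{
+//		__m128d t = *a;
+//		*a = _mm_move_sd(*a, *b); // low from *b, high kept from *a
+//		*b = _mm_move_sd(*b, t);  // low from old *a, high kept from *b
+//		}
+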
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
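+// Reference only, not assembled: scalar sketch of the update performed below
+// on the 4x4 accumulator block (C and the block stored column-major with
+// leading dimension 4):
+//
+//	static void scale_ab_4x4_ref(double alpha, double beta,
+//		const double *C, double acc[16])
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int i=0; i<4; i++)
+//				acc[i+4*j] = alpha*acc[i+4*j] + beta*C[i+4*j];
+//		}
+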
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+ movapd 0(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0.0
+//
+// input arguments:
+// r10 <- alpha
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
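+// Reference only, not assembled: with beta = 0.0 no load of C is needed and
+// the block is simply scaled in place:
+//
+//	for(int j=0; j<4; j++)
+//		for(int i=0; i<4; i++)
+//			acc[i+4*j] *= alpha;
+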
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+ movapd 0(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
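+// Reference only, not assembled: after the same low-half swap as in
+// inner_blend_4x4_lib4, the block is simply accumulated onto C with
+// alpha = beta = 1.0:
+//
+//	for(int j=0; j<4; j++)
+//		for(int i=0; i<4; i++)
+//			acc[i+4*j] += C[i+4*j];
+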
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+
+ movapd 0(%r10), %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r10), %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r10), %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r10), %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r10), %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r10), %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r10), %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r10), %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
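+// Reference only, not assembled: scalar sketch of this edge step on the 4x4
+// block D (column-major, leading dimension 4). A non-positive pivot yields a
+// zero inverse-diagonal entry and a zeroed column, matching the branches to
+// labels 1/3/5/7 below; kn limits how many columns are factorized.
+//
+//	#include <math.h>
+//	static void potrf_4x4_vs_ref(double *D, double *inv_diag_D, int kn)
+//		{
+//		for(int j=0; j<4 && j<kn; j++)
+//			{
+//			double d = D[j+4*j];
+//			for(int k=0; k<j; k++)
+//				d -= D[j+4*k]*D[j+4*k];
+//			double idiag = d>0.0 ? 1.0/sqrt(d) : 0.0;
+//			inv_diag_D[j] = idiag;
+//			for(int i=j; i<4; i++)
+//				{
+//				double t = D[i+4*j];
+//				for(int k=0; k<j; k++)
+//					t -= D[i+4*k]*D[j+4*k];
+//				D[i+4*j] = t*idiag;
+//				}
+//			}
+//		}
+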
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ xorpd %xmm15, %xmm15 // 0.0
+
+ movsd %xmm0, %xmm13
+ ucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+2:
+ cmpl $2, %r11d
+ movsd %xmm12, 0(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm0
+ mulpd %xmm12, %xmm4
+
+ jl 0f // ret
+
+ movapd %xmm0, %xmm12
+ shufpd $0x3, %xmm12, %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm1
+ subpd %xmm13, %xmm5
+ movapd %xmm1, %xmm13
+ shufpd $0x3, %xmm13, %xmm13 // 0x1 ???
+ ucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+4:
+ cmpl $3, %r11d
+ movsd %xmm12, 8(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm1
+ mulpd %xmm12, %xmm5
+
+ jl 0f // ret
+
+ movddup %xmm4, %xmm12
+ movddup %xmm5, %xmm13
+ mulpd %xmm4, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm6
+ subpd %xmm13, %xmm6
+ movsd %xmm6, %xmm13
+ ucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+6:
+ cmpl $4, %r11d
+ movsd %xmm12, 16(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm6
+
+ jl 0f // ret
+
+ movapd %xmm4, %xmm12
+ movapd %xmm5, %xmm13
+ movapd %xmm6, %xmm14
+ shufpd $0x3, %xmm12, %xmm12
+ shufpd $0x3, %xmm13, %xmm13
+ shufpd $0x3, %xmm14, %xmm14
+ mulpd %xmm4, %xmm12
+ mulpd %xmm5, %xmm13
+ mulpd %xmm6, %xmm14
+ subpd %xmm12, %xmm7
+ subpd %xmm13, %xmm7
+ subpd %xmm14, %xmm7
+ movapd %xmm7, %xmm13
+ shufpd $0x3, %xmm13, %xmm13
+ ucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+8:
+ movsd %xmm12, 24(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm7
+
+ jmp 0f
+
+1:
+ xorpd %xmm12, %xmm12
+ jmp 2b
+
+3:
+ xorpd %xmm12, %xmm12
+ jmp 4b
+
+5:
+ xorpd %xmm12, %xmm12
+ jmp 6b
+
+7:
+ xorpd %xmm12, %xmm12
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
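+// Reference only, not assembled: scalar sketch of the solve performed below,
+// D <- D * E^{-T} with E lower triangular and inv_diag_E holding the
+// reciprocals of its diagonal (both stored column-major, leading dimension 4):
+//
+//	static void trsm_rlt_inv_4x4_ref(const double *E,
+//		const double *inv_diag_E, double D[16])
+//		{
+//		for(int j=0; j<4; j++)
+//			{
+//			for(int k=0; k<j; k++)
+//				for(int i=0; i<4; i++)
+//					D[i+4*j] -= E[j+4*k]*D[i+4*k];
+//			for(int i=0; i<4; i++)
+//				D[i+4*j] *= inv_diag_E[j];
+//			}
+//		}
+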
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ movddup 0(%r11), %xmm13
+ mulpd %xmm13, %xmm0
+ mulpd %xmm13, %xmm4
+
+ movddup 8(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm13
+ mulpd %xmm4, %xmm12
+ subpd %xmm13, %xmm1
+ subpd %xmm12, %xmm5
+ movddup 8(%r11), %xmm13
+ mulpd %xmm13, %xmm1
+ mulpd %xmm13, %xmm5
+
+ movddup 16(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 48(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 16(%r11), %xmm13
+ mulpd %xmm13, %xmm2
+ mulpd %xmm13, %xmm6
+
+ movddup 24(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 56(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 88(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm2, %xmm12
+ mulpd %xmm6, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 24(%r11), %xmm13
+ mulpd %xmm13, %xmm3
+ mulpd %xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ movddup 0(%r11), %xmm13
+ cmpl $2, %r12d
+ mulpd %xmm13, %xmm0
+ mulpd %xmm13, %xmm4
+
+ jl 0f // ret
+
+ movddup 8(%r10), %xmm13
+ cmpl $3, %r12d
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm13
+ mulpd %xmm4, %xmm12
+ subpd %xmm13, %xmm1
+ subpd %xmm12, %xmm5
+ movddup 8(%r11), %xmm13
+ mulpd %xmm13, %xmm1
+ mulpd %xmm13, %xmm5
+
+ jl 0f // ret
+
+ movddup 16(%r10), %xmm13
+ cmpl $4, %r12d
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 48(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 16(%r11), %xmm13
+ mulpd %xmm13, %xmm2
+ mulpd %xmm13, %xmm6
+
+ jl 0f // ret
+
+ movddup 24(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 56(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 88(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm2, %xmm12
+ mulpd %xmm6, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 24(%r11), %xmm13
+ mulpd %xmm13, %xmm3
+ mulpd %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ movapd %xmm3, 96(%r10)
+ movapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
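+// Reference only, not assembled: the masking below stores only the top-left
+// km x kn corner of the 4x4 block into the column-major panel D:
+//
+//	for(int j=0; j<kn; j++)
+//		for(int i=0; i<km; i++)
+//			D[i+4*j] = acc[i+4*j];
+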
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ movsd %xmm0, 0(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movsd %xmm1, 32(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movsd %xmm2, 64(%r10)
+ je 4f // end
+ movsd %xmm3, 96(%r10)
+
+ jmp 4f
+
+0:
+ // km==2
+ movapd %xmm0, 0(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+
+ jmp 4f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ movapd %xmm0, 0(%r10)
+ movsd %xmm4, 16(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ movsd %xmm5, 48(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ movsd %xmm6, 80(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+ movsd %xmm7, 112(%r10)
+
+ jmp 4f
+
+2:
+ // km==4
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+ movapd %xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// xmm0 <-
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// xmm0 <-
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // masks computation ???
+
+	// shift D and the solution registers to skip the first n0 columns
+ cmpl $0, %r15d
+ jle 0f
+
+	movapd	%xmm1, %xmm0
+	movapd	%xmm5, %xmm4
+	movapd	%xmm2, %xmm1
+	movapd	%xmm6, %xmm5
+	movapd	%xmm3, %xmm2
+	movapd	%xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+	movapd	%xmm1, %xmm0
+	movapd	%xmm5, %xmm4
+	movapd	%xmm2, %xmm1
+	movapd	%xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+	movapd	%xmm1, %xmm0
+	movapd	%xmm5, %xmm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+
+ cmpl $0, %r10d
+ jg 0f
+
+ ///////////////
+ // offset==0 //
+ ///////////////
+
+ cmpl $0, %r13d
+ jle 4f
+
+ cmpl $1, %r13d
+ jg 5f
+
+ movsd 0(%r11), %xmm8
+ movsd %xmm8, %xmm0
+ movsd 32(%r11), %xmm8
+ movsd %xmm8, %xmm1
+ movsd 64(%r11), %xmm8
+ movsd %xmm8, %xmm2
+ movsd 96(%r11), %xmm8
+ movsd %xmm8, %xmm3
+
+ jmp 4f
+
+5:
+
+ cmpl $2, %r13d
+ jg 5f
+
+ movapd 0(%r11), %xmm0
+ movapd 32(%r11), %xmm1
+ movapd 64(%r11), %xmm2
+ movapd 96(%r11), %xmm3
+
+ jmp 4f
+
+5:
+
+ cmpl $3, %r13d
+ jg 5f
+
+ movapd 0(%r11), %xmm0
+ movsd 16(%r11), %xmm8
+ movsd %xmm8, %xmm4
+ movapd 32(%r11), %xmm1
+ movsd 48(%r11), %xmm8
+ movsd %xmm8, %xmm5
+ movapd 64(%r11), %xmm2
+ movsd 80(%r11), %xmm8
+ movsd %xmm8, %xmm6
+ movapd 96(%r11), %xmm3
+ movsd 112(%r11), %xmm8
+ movsd %xmm8, %xmm7
+
+ jmp 4f
+
+5:
+
+ movapd 0(%r11), %xmm0
+ movapd 16(%r11), %xmm4
+ movapd 32(%r11), %xmm1
+ movapd 48(%r11), %xmm5
+ movapd 64(%r11), %xmm2
+ movapd 80(%r11), %xmm6
+ movapd 96(%r11), %xmm3
+ movapd 112(%r11), %xmm7
+
+4:
+ cmpl $2, %r14d
+ jg 5f
+ je 4f
+
+ // km==1
+ movsd %xmm0, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movsd %xmm1, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movsd %xmm2, 64(%r11)
+ je 3f // end
+ movsd %xmm3, 96(%r11)
+
+ jmp 3f
+
+4:
+ // km==2
+ movapd %xmm0, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+
+ jmp 3f
+
+5:
+ cmpl $3, %r14d
+ jg 6f
+
+ // km==3
+ movapd %xmm0, 0(%r11)
+ movsd %xmm4, 16(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ movsd %xmm5, 48(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ movsd %xmm6, 80(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+ movsd %xmm7, 112(%r11)
+
+ jmp 3f
+
+6:
+ // km==4
+ movapd %xmm0, 0(%r11)
+ movapd %xmm4, 16(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ movapd %xmm5, 48(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ movapd %xmm6, 80(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+ movapd %xmm7, 112(%r11)
+
+ jmp 3f
+
+0:
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $1, %r10d
+ jg 1f
+
+ ///////////////
+ // offset==1 //
+ ///////////////
+
+ // TODO
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ ///////////////
+ // offset==2 //
+ ///////////////
+
+ // TODO
+
+ jmp 3f
+
+2:
+
+ ///////////////
+ // offset==3 //
+ ///////////////
+
+ // TODO
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
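+// Reference only, not assembled: only the lower triangle (diagonal included)
+// of the 4x4 block is written; the strictly upper part of D is preserved by
+// merging in its current values:
+//
+//	for(int j=0; j<4; j++)
+//		for(int i=j; i<4; i++)
+//			D[i+4*j] = acc[i+4*j];
+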
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+// movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+// movapd %xmm3, 96(%r10)
+ movsd 112(%r10), %xmm15
+ movsd %xmm15, %xmm7
+ movapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ movsd %xmm0, 0(%r10)
+
+ jmp 3f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ jl 3f // end
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+
+ jmp 3f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ movsd %xmm4, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movsd %xmm5, 48(%r10)
+ jl 3f // end
+// movapd %xmm2, 64(%r10)
+ movsd %xmm6, 80(%r10)
+
+ jmp 3f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ jl 3f // end
+// movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ je 3f // end
+// movapd %xmm3, 96(%r10)
+ movsd 112(%r10), %xmm15
+ movsd %xmm15, %xmm7
+ movapd %xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
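+// Computes the 4x4 block D = alpha*A*B^T + beta*C, with A and B stored as
+// 4 x k panels in lib4 (block-of-4) format. Reference only, not assembled:
+//
+//	for(int j=0; j<4; j++)
+//		for(int i=0; i<4; i++)
+//			{
+//			double d = 0.0;
+//			for(int l=0; l<k; l++)
+//				d += A[i+4*l]*B[j+4*l];
+//			D[i+4*j] = alpha[0]*d + beta[0]*C[i+4*j];
+//			}
+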
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+#if 0 //
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+#else //
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+#endif //
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nn_4x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_vs_lib4
+ .type kernel_dgemm_nn_4x4_vs_lib4, @function
+kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_vs_lib4
+_kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_vs_lib4
+ .def kernel_dgemm_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_vs_lib4, .-kernel_dgemm_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
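+// (fused kernel: factorizes the 4x4 matrix C + Ap*Bp^T - Am*Bm^T, writing the
+// lower Cholesky factor to D and its reciprocal diagonal to inv_diag_D; km and
+// kn limit how many rows/columns of D are actually stored.)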
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
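+// (fused kernel: forms M = C + Ap*Bp^T - Am*Bm^T and solves the triangular
+// system D*E^T = M, i.e. D = M*E^-T, with E lower triangular and its
+// reciprocal diagonal supplied in inv_diag_E.)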
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
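+// note: the double-precision constants LC02-LC04 below are emitted as pairs
+// of .long values, i.e. the low and high 32-bit halves (in that order, little
+// endian) of each IEEE-754 double: e.g. the pair { 0, 1071644672 } is
+// 0x3FE0000000000000 = 0.5 and { 0, 1072693248 } is 0x3FF0000000000000 = 1.0.
+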
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/lib/dummy.txt b/lib/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/dummy.txt
diff --git a/test_problems/CMakeLists.txt b/test_problems/CMakeLists.txt
new file mode 100644
index 0000000..77becb1
--- /dev/null
+++ b/test_problems/CMakeLists.txt
@@ -0,0 +1,32 @@
+###################################################################################################
+# #
+# This file is part of HPIPM. #
+# #
+# HPIPM -- High Performance Interior Point Method. #
+# Copyright (C) 2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+add_executable(d_blas test_blas_d.c)
+target_link_libraries(d_blas blasfeo m)
+
+add_executable(s_blas test_blas_s.c)
+target_link_libraries(s_blas blasfeo m)
diff --git a/test_problems/Makefile b/test_problems/Makefile
new file mode 100644
index 0000000..f2e4741
--- /dev/null
+++ b/test_problems/Makefile
@@ -0,0 +1,67 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = /opt/netlib/liblapack.a /opt/blis/lib/libblis.a -lgfortran -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+ifeq ($(REF_BLAS), ATLAS)
+LIBS = /opt/atlas/lib/liblapack.a /opt/atlas/lib/libcblas.a /opt/atlas/lib/libf77blas.a /opt/atlas/lib/libatlas.a -lgfortran -lm
+endif
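+
+# REF_BLAS (set in ../Makefile.rule) selects the reference BLAS/LAPACK the test
+# is linked against for comparison; the paths above assume the libraries are
+# installed under /opt and may need to be adapted to the local machine.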
+
+#ifneq ($(NUM_THREAD), 1)
+#LIBS += -pthread
+#endif
+
+OBJS_TEST = test_blas_d.o
+#OBJS_TEST = test_blas_s.o
+#OBJS_TEST = test_d_strmat.o
+#OBJS_TEST = test_s_strmat.o
+#OBJS_TEST = kernel_assembly.o test_assembly.o
+
+obj: $(OBJS_TEST)
+ $(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+clean:
+ rm -f *.o
+ rm -f test.out
+ rm -f libblasfeo.a
+
diff --git a/test_problems/cpu_freq.h b/test_problems/cpu_freq.h
new file mode 100644
index 0000000..30320fc
--- /dev/null
+++ b/test_problems/cpu_freq.h
@@ -0,0 +1,31 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifndef GHZ_MAX
+#define GHZ_MAX 3.6
+#endif
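+
+// GHZ_MAX is only used by the test programs to compute the theoretical-peak
+// reference line; override it here (or via -DGHZ_MAX=...) to match the
+// sustained clock of the machine under test.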
diff --git a/test_problems/kernel_assembly.S b/test_problems/kernel_assembly.S
new file mode 100644
index 0000000..b393e0d
--- /dev/null
+++ b/test_problems/kernel_assembly.S
@@ -0,0 +1,633 @@
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+
+#else
+
+#error wrong OS
+
+#endif
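+
+// note on the argument macros above: on the System V AMD64 ABI (Linux, Mac)
+// the first six integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+// and the seventh is the first stack argument, reached at STACKSIZE+8(%rsp)
+// once the prologue has moved rsp down by STACKSIZE; on Windows x64 only four
+// register arguments are used (rcx, rdx, r8, r9) and the caller reserves a
+// 32-byte shadow area, so the fifth argument sits at STACKSIZE+40(%rsp).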
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
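+//
+// note: the product is accumulated in the permuted layout documented above
+// (ymm0 holds the d00/d11/d22/d33 "diagonal", ymm1-ymm3 the rotated ones);
+// the vshufpd/vperm2f128 pattern in the loop rotates the B column so that
+// each multiply feeds one of these diagonals, and the blend routine below
+// restores plain column order before scaling and storing.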
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+
+// cmpl $3, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ subl $1, %r10d
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
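+//
+// note: when beta compares equal to 0.0 the routine skips reading C entirely,
+// so in that case C only needs to be a valid (possibly uninitialized) buffer.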
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovupd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovupd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+ vmovupd %ymm2, 64(%r10)
+ vmovupd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemm_nt_4x4_lib4_test(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4_test
+#if defined(OS_LINUX)
+ .type kernel_dgemm_nt_4x4_lib4_test, @function
+#else // OS_WINDOWS
+ .def kernel_dgemm_nt_4x4_lib4_test; .scl 2; .type 32; .endef
+#endif
+kernel_dgemm_nt_4x4_lib4_test:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4_test
+_kernel_dgemm_nt_4x4_lib4_test:
+#endif
+
+ // prologue
+
+ subq $STACKSIZE, %rsp
+ movq %rbx, (%rsp)
+ movq %rbp, 8(%rsp)
+ movq %r12, 16(%rsp)
+ movq %r13, 24(%rsp)
+ movq %r14, 32(%rsp)
+ movq %r15, 40(%rsp)
+#if defined(OS_WINDOWS)
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
+#endif
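+
+	// note: rbx, rbp and r12-r15 are callee-saved in both ABIs; the Windows
+	// x64 ABI additionally requires rdi, rsi and xmm6-xmm15 to be preserved,
+	// hence the extra saves above (restored in the epilogue).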
+
+ vzeroupper
+
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+// movq ARG6, %rax
+// movq STACKSIZE + 48(%rsp), %rax
+
+
+ // epilogue
+
+ vzeroupper
+
+ movq (%rsp), %rbx
+ movq 8(%rsp), %rbp
+ movq 16(%rsp), %r12
+ movq 24(%rsp), %r13
+ movq 32(%rsp), %r14
+ movq 40(%rsp), %r15
+#if defined(OS_WINDOWS)
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
+#endif
+ addq $STACKSIZE, %rsp
+
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4_test, .-kernel_dgemm_nt_4x4_lib4_test
+#endif
+
+
diff --git a/test_problems/results/dummy.txt b/test_problems/results/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test_problems/results/dummy.txt
diff --git a/test_problems/test_assembly.c b/test_problems/test_assembly.c
new file mode 100644
index 0000000..3a07a13
--- /dev/null
+++ b/test_problems/test_assembly.c
@@ -0,0 +1,59 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+int main()
+ {
+
+ printf("\ntest assembly\n");
+
+ int ii;
+
+ int n = 12;
+
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+ d_print_mat(n, n, A, n);
+
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+ d_print_mat(n, n, B, n);
+
+ struct d_strmat sA;
+ d_allocate_strmat(n, n, &sA);
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+ d_allocate_strmat(n, n, &sB);
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sD;
+ d_allocate_strmat(n, n, &sD);
+
+ struct d_strmat sC;
+ d_allocate_strmat(n, n, &sC);
+
+ double alpha = 1.0;
+ double beta = 0.0;
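+
+	// the call below exercises the hand-written kernel from kernel_assembly.S:
+	// it computes the leading 4x4 block of D = alpha*A*B^T + beta*C, with all
+	// operands in BLASFEO's packed panel-major format (4-row panels, one panel
+	// column = 4 consecutive doubles), roughly:
+	//   for(jj=0; jj<4; jj++) for(ii=0; ii<4; ii++) {
+	//     double d = 0.0;
+	//     for(kk=0; kk<k; kk++) d += A[ii+4*kk]*B[jj+4*kk];
+	//     D[ii+4*jj] = alpha*d + beta*C[ii+4*jj]; }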
+ int ret = kernel_dgemm_nt_4x4_lib4_test(n, &alpha, sB.pA, sA.pA, &beta, sB.pA, sD.pA);
+ d_print_strmat(n, n, &sD, 0, 0);
+// printf("\n%ld %ld\n", (long long) n, ret);
+// printf("\n%ld %ld\n", (long long) &alpha, ret);
+// printf("\n%ld %ld\n", (long long) sA.pA, ret);
+// printf("\n%ld %ld\n", (long long) sB.pA, ret);
+// printf("\n%ld %ld\n", (long long) &beta, ret);
+// printf("\n%ld %ld\n", (long long) sC.pA, ret);
+// printf("\n%ld %ld\n", (long long) sD.pA, ret);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_blas_d.c b/test_problems/test_blas_d.c
new file mode 100644
index 0000000..1e71494
--- /dev/null
+++ b/test_problems/test_blas_d.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//#include <xmmintrin.h> // needed to flush to zero sub-normals with _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); in the main()
+//#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+#ifndef D_PS
+#define D_PS 1
+#endif
+#ifndef D_NC
+#define D_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+ {
+
+#if defined(REF_BLAS_OPENBLAS)
+ openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+ omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+ mkl_set_num_threads(1);
+#endif
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
+//#endif
+
+ printf("\n");
+ printf("\n");
+ printf("\n");
+
+ printf("BLAS performance test - double precision\n");
+ printf("\n");
+
+ // maximum frequency of the processor
+ const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit cpu_freq.h to modify this value).\n", GHz_max);
+ printf("\n");
+
+ // maximum flops per cycle, double precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+ const float flops_max = 16;
+ printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ const float flops_max = 8;
+ printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+ const float flops_max = 4;
+ printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+ const float flops_max = 8;
+ printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ const float flops_max = 4;
+ printf("Testing BLAS version for NEONv2 instruction set, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ const float flops_max = 2;
+ printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+ const float flops_max = 2;
+ printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
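+
+	// (flops_max above is the per-core double-precision peak in flops/cycle:
+	// e.g. Haswell sustains two 256-bit FMAs per cycle = 2*4*2 = 16, Sandy
+	// Bridge one 256-bit multiply plus one 256-bit add = 8, so the theoretical
+	// peak reported is flops_max*GHz_max Gflops.)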
+
+// FILE *f;
+// f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+// fprintf(f, "C = 'd_x64_intel_haswell';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// fprintf(f, "C = 'd_x64_intel_sandybridge';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+// fprintf(f, "C = 'd_x64_intel_core';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+// fprintf(f, "C = 'd_x64_amd_bulldozer';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// fprintf(f, "C = 'd_armv8a_arm_cortex_a57';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+// fprintf(f, "C = 'd_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+// fprintf(f, "C = 'd_generic';\n");
+// fprintf(f, "\n");
+#endif
+
+// fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+// fprintf(f, "\n");
+
+// fprintf(f, "B = [\n");
+
+
+
+ int i, j, rep, ll;
+
+ const int bsd = D_PS;
+ const int ncd = D_NC;
+
+/* int info = 0;*/
+
+ printf("\nn\t dgemm_blasfeo\t dgemm_blas\n");
+	printf("\nn\t Gflops\t %%\t Gflops\t %%\n\n");
+
+#if 1
+ int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+ int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+
+// for(ll=0; ll<24; ll++)
+ for(ll=0; ll<75; ll++)
+// for(ll=0; ll<115; ll++)
+// for(ll=0; ll<120; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = nnrep[ll];
+// int n = ll+1;
+// int nrep = nnrep[0];
+// n = n<12 ? 12 : n;
+// n = n<8 ? 8 : n;
+
+#else
+ int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+
+ for(ll=0; ll<24; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = 40000; //nnrep[ll];
+#endif
+
+ double *A; d_zeros(&A, n, n);
+ double *B; d_zeros(&B, n, n);
+ double *C; d_zeros(&C, n, n);
+ double *M; d_zeros(&M, n, n);
+
+ char c_n = 'n';
+ char c_l = 'l';
+ char c_r = 'r';
+ char c_t = 't';
+ char c_u = 'u';
+ int i_1 = 1;
+ int i_t;
+ double d_1 = 1;
+ double d_0 = 0;
+
+ for(i=0; i<n*n; i++)
+ A[i] = i;
+
+ for(i=0; i<n; i++)
+ B[i*(n+1)] = 1;
+
+ for(i=0; i<n*n; i++)
+ M[i] = 1;
+
+ int n2 = n*n;
+ double *B2; d_zeros(&B2, n, n);
+ for(i=0; i<n*n; i++)
+ B2[i] = 1e-15;
+ for(i=0; i<n; i++)
+ B2[i*(n+1)] = 1;
+
+ int pnd = ((n+bsd-1)/bsd)*bsd;
+ int cnd = ((n+ncd-1)/ncd)*ncd;
+ int cnd2 = 2*((n+ncd-1)/ncd)*ncd;
+
+ double *x; d_zeros_align(&x, pnd, 1);
+ double *y; d_zeros_align(&y, pnd, 1);
+ double *x2; d_zeros_align(&x2, pnd, 1);
+ double *y2; d_zeros_align(&y2, pnd, 1);
+ double *diag; d_zeros_align(&diag, pnd, 1);
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ for(i=0; i<pnd; i++) x[i] = 1;
+ for(i=0; i<pnd; i++) x2[i] = 1;
+
+ // matrix struct
+#if 0
+ struct d_strmat sA; d_allocate_strmat(n+4, n+4, &sA);
+ struct d_strmat sB; d_allocate_strmat(n+4, n+4, &sB);
+ struct d_strmat sC; d_allocate_strmat(n+4, n+4, &sC);
+ struct d_strmat sD; d_allocate_strmat(n+4, n+4, &sD);
+ struct d_strmat sE; d_allocate_strmat(n+4, n+4, &sE);
+#else
+ struct d_strmat sA; d_allocate_strmat(n, n, &sA);
+ struct d_strmat sB; d_allocate_strmat(n, n, &sB);
+ struct d_strmat sB2; d_allocate_strmat(n, n, &sB2);
+ struct d_strmat sB3; d_allocate_strmat(n, n, &sB3);
+ struct d_strmat sC; d_allocate_strmat(n, n, &sC);
+ struct d_strmat sD; d_allocate_strmat(n, n, &sD);
+ struct d_strmat sE; d_allocate_strmat(n, n, &sE);
+#endif
+ struct d_strvec sx; d_allocate_strvec(n, &sx);
+ struct d_strvec sy; d_allocate_strvec(n, &sy);
+ struct d_strvec sz; d_allocate_strvec(n, &sz);
+
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_cvt_mat2strmat(n, n, B2, n, &sB2, 0, 0);
+ d_cvt_vec2strvec(n, x, &sx, 0);
+ int ii;
+ for(ii=0; ii<n; ii++)
+ {
+ DMATEL_LIBSTR(&sB3, ii, ii) = 1.0;
+// DMATEL_LIBSTR(&sB3, n-1, ii) = 1.0;
+ DMATEL_LIBSTR(&sB3, ii, n-1) = 1.0;
+ DVECEL_LIBSTR(&sx, ii) = 1.0;
+ }
+// d_print_strmat(n, n, &sB3, 0, 0);
+// if(n==20) return;
+
+ int qr_work_size = 0;//dgeqrf_work_size_libstr(n, n);
+ void *qr_work;
+ v_zeros_align(&qr_work, qr_work_size);
+
+ int lq_work_size = 0;//dgelqf_work_size_libstr(n, n);
+ void *lq_work;
+ v_zeros_align(&lq_work, lq_work_size);
+
+ // create matrix to pivot all the time
+// dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+ double *dummy;
+
+ int info;
+
+ /* timing */
+ struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+ /* warm up */
+ for(rep=0; rep<nrep; rep++)
+ {
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+ }
+
+ double alpha = 1.0;
+ double beta = 0.0;
+
+ gettimeofday(&tv0, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+
+// dgemm_nt_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+// dgemm_nn_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+// dsyrk_nt_l_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 1.0, pC, cnd, pD, cnd);
+// dtrmm_nt_ru_lib(n, n, pA, cnd, pB, cnd, 0, pC, cnd, pD, cnd);
+// dpotrf_nt_l_lib(n, n, pB, cnd, pD, cnd, diag);
+// dsyrk_dpotrf_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd, diag);
+// dsyrk_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd);
+// dpotrf_nt_l_lib(n, n, pD, cnd, pD, cnd, diag);
+// dgetrf_nn_nopivot_lib(n, n, pB, cnd, pB, cnd, diag);
+// dgetrf_nn_lib(n, n, pB, cnd, pB, cnd, diag, ipiv);
+// dtrsm_nn_ll_one_lib(n, n, pD, cnd, pB, cnd, pB, cnd);
+// dtrsm_nn_lu_inv_lib(n, n, pD, cnd, diag, pB, cnd, pB, cnd);
+ }
+
+ gettimeofday(&tv1, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// kernel_dgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dsyrk_nt_l_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_dger4_12_sub_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_12r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_8r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger12_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+// kernel_dger8_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+// kernel_dger4_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+// kernel_dger2_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+// kernel_dger4_sub_8c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_4c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dgemm_nn_4x12_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nn_4x4_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dsyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dsyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dpotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// dpotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+// dgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// dgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+// dgeqrf_libstr(n, n, &sC, 0, 0, &sD, 0, 0, qr_work);
+// dcolin_libstr(n, &sx, 0, &sB3, 0, n-1);
+// dgelqf_libstr(n, n, &sB3, 0, 0, &sB3, 0, 0, lq_work);
+// dtrmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+// dtrmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_rltn_libstr(n, n, 1.0, &sB2, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+// dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dsymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+ }
+
+// d_print_strmat(n, n, &sD, 0, 0);
+
+ gettimeofday(&tv2, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+// dgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// dpotrf_(&c_l, &n, B2, &n, &info);
+// dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+// dtrmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+// dgetrf_(&n, &n, B2, &n, ipiv, &info);
+// dtrsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+// dtrsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+// dtrtri_(&c_l, &c_n, &n, B2, &n, &info);
+// dlauum_(&c_l, &n, B, &n, &info);
+// dgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+// dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+// dtrmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// dtrsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// dsymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+// for(i=0; i<n; i++)
+// {
+// i_t = n-i;
+// dcopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+// }
+// dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+// dpotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+// dgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// dsyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+// dtrmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+// dpotrf_(&c_l, &n77, B, &n77, &info);
+// dtrtri_(&c_l, &c_n, &n77, B, &n77, &info);
+// dlauum_(&c_l, &n77, B, &n77, &info);
+#endif
+ }
+
+ gettimeofday(&tv3, NULL); // stop
+
+ float Gflops_max = flops_max * GHz_max;
+
+// float flop_operation = 4*16.0*2*n; // kernel 12x4
+// float flop_operation = 3*16.0*2*n; // kernel 12x4
+// float flop_operation = 2*16.0*2*n; // kernel 8x4
+// float flop_operation = 1*16.0*2*n; // kernel 4x4
+// float flop_operation = 0.5*16.0*2*n; // kernel 2x4
+
+ float flop_operation = 2.0*n*n*n; // dgemm
+// float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+// float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+// float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+// float flop_operation = 4.0/3.0*n*n*n; // dgeqrf
+// float flop_operation = 2.0*n*n; // dgemv dsymv
+// float flop_operation = 1.0*n*n; // dtrmv dtrsv
+// float flop_operation = 4.0*n*n; // dgemv_nt
+
+// float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
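+
+	// (the counts above are the standard dense linear-algebra flop counts for
+	// an n x n problem; the Gflops figures printed below are
+	// 1e-9*flop_operation divided by the measured wall-clock time per
+	// repetition.)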
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gflops_hpmpc = 1e-9*flop_operation/time_hpmpc;
+ float Gflops_blasfeo = 1e-9*flop_operation/time_blasfeo;
+ float Gflops_blas = 1e-9*flop_operation/time_blas;
+
+
+// printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+
+	d_free(A);
+	d_free(B);
+	d_free(B2);
+	d_free(C);
+	d_free(M);
+	d_free_align(x);
+	d_free_align(y);
+	d_free_align(x2);
+	d_free_align(y2);
+	d_free_align(diag);
+	int_free(ipiv);
+ free(qr_work);
+ free(lq_work);
+
+ d_free_strmat(&sA);
+ d_free_strmat(&sB);
+ d_free_strmat(&sB2);
+ d_free_strmat(&sB3);
+ d_free_strmat(&sC);
+ d_free_strmat(&sD);
+ d_free_strmat(&sE);
+ d_free_strvec(&sx);
+ d_free_strvec(&sy);
+ d_free_strvec(&sz);
+
+ }
+
+ printf("\n");
+
+// fprintf(f, "];\n");
+// fclose(f);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_blas_s.c b/test_problems/test_blas_s.c
new file mode 100644
index 0000000..3ea9f11
--- /dev/null
+++ b/test_problems/test_blas_s.c
@@ -0,0 +1,454 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+#ifndef S_PS
+#define S_PS 1
+#endif
+#ifndef S_NC
+#define S_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+ {
+
+#if defined(REF_BLAS_OPENBLAS)
+ openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+ omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+ mkl_set_num_threads(1);
+#endif
+
+ printf("\n");
+ printf("\n");
+ printf("\n");
+
+ printf("BLAS performance test - float precision\n");
+ printf("\n");
+
+ // maximum frequency of the processor
+ const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit cpu_freq.h to modify this value).\n", GHz_max);
+ printf("\n");
+
+ // maximum flops per cycle, single precision
+	// maximum memops (sustained load->store of floats) per cycle, single precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+ const float flops_max = 32; // 2x256 bit fma
+ const float memops_max = 8; // 2x256 bit load + 1x256 bit store
+ printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ const float flops_max = 16; // 1x256 bit mul + 1x256 bit add
+ const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+ printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+ const float flops_max = 8; // 1x128 bit mul + 1x128 bit add
+ const float memops_max = 4; // 1x128 bit load + 1x128 bit store;
+ printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+ const float flops_max = 16; // 2x128 bit fma
+ const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+ printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ const float flops_max = 8; // 1x128 bit fma
+ const float memops_max = 4; // ???
+	printf("Testing BLAS version for NEONv2 instruction set, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ const float flops_max = 8; // 1x128 bit fma
+ const float memops_max = 4; // ???
+ printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+ const float flops_max = 2; // 1x32 bit mul + 1x32 bit add ???
+ const float memops_max = 1; // ???
+ printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
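+
+	// Illustrative sketch of where flops_max comes from (kept disabled; peak_sp is a
+	// hypothetical name): e.g. on Haswell two 256-bit FMA units retire
+	// 2 * (256/32) * 2 = 32 single-precision flops per cycle, and the theoretical
+	// peak used below is simply flops_max * GHz_max Gflops.
+#if 0
+	float peak_sp = flops_max * GHz_max; // e.g. 32 * 3.0 = 96.0 Gflops at 3.0 GHz
+	printf("theoretical peak (check): %5.1f Gflops\n", peak_sp);
+#endif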
+
+// FILE *f;
+// f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+// fprintf(f, "C = 's_x64_intel_haswell';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// fprintf(f, "C = 's_x64_intel_sandybridge';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+// fprintf(f, "C = 's_x64_intel_core';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+// fprintf(f, "C = 's_x64_amd_bulldozer';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+// fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+// fprintf(f, "C = 's_generic';\n");
+// fprintf(f, "\n");
+#endif
+
+// fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+// fprintf(f, "\n");
+
+// fprintf(f, "B = [\n");
+
+
+
+ int i, j, rep, ll;
+
+ const int bss = S_PS;
+ const int ncs = S_NC;
+
+/* int info = 0;*/
+
+ printf("\nn\t sgemm_blasfeo\t sgemm_blas\n");
+ printf("\nn\t Gflops\t %%\t Gflops\t %%\n\n");
+
+#if 1
+ int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+ int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+
+// for(ll=0; ll<24; ll++)
+ for(ll=0; ll<75; ll++)
+// for(ll=0; ll<115; ll++)
+// for(ll=0; ll<120; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = nnrep[ll];
+// int n = ll+1;
+// int nrep = nnrep[0];
+// n = n<16 ? 16 : n;
+
+ int n2 = n*n;
+
+#else
+ int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+
+ for(ll=0; ll<24; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = 40000; //nnrep[ll];
+#endif
+
+ float *A; s_zeros(&A, n, n);
+ float *B; s_zeros(&B, n, n);
+ float *C; s_zeros(&C, n, n);
+ float *M; s_zeros(&M, n, n);
+
+ char c_n = 'n';
+ char c_l = 'l';
+ char c_r = 'r';
+ char c_t = 't';
+ char c_u = 'u';
+ int i_1 = 1;
+ int i_t;
+ float d_1 = 1;
+ float d_0 = 0;
+
+ for(i=0; i<n*n; i++)
+ A[i] = i;
+
+ for(i=0; i<n; i++)
+ B[i*(n+1)] = 1;
+
+ for(i=0; i<n*n; i++)
+ M[i] = 1;
+
+ float *B2; s_zeros(&B2, n, n);
+ for(i=0; i<n*n; i++)
+ B2[i] = 1e-15;
+ for(i=0; i<n; i++)
+ B2[i*(n+1)] = 1;
+
+ float *x; s_zeros(&x, n, 1);
+ float *y; s_zeros(&y, n, 1);
+ float *x2; s_zeros(&x2, n, 1);
+ float *y2; s_zeros(&y2, n, 1);
+ float *diag; s_zeros(&diag, n, 1);
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+// for(i=0; i<n; i++) x[i] = 1;
+// for(i=0; i<n; i++) x2[i] = 1;
+
+ // matrix struct
+#if 0
+ struct s_strmat sA; s_allocate_strmat(n+4, n+4, &sA);
+ struct s_strmat sB; s_allocate_strmat(n+4, n+4, &sB);
+ struct s_strmat sC; s_allocate_strmat(n+4, n+4, &sC);
+ struct s_strmat sD; s_allocate_strmat(n+4, n+4, &sD);
+ struct s_strmat sE; s_allocate_strmat(n+4, n+4, &sE);
+#else
+ struct s_strmat sA; s_allocate_strmat(n, n, &sA);
+ struct s_strmat sB; s_allocate_strmat(n, n, &sB);
+ struct s_strmat sC; s_allocate_strmat(n, n, &sC);
+ struct s_strmat sD; s_allocate_strmat(n, n, &sD);
+ struct s_strmat sE; s_allocate_strmat(n, n, &sE);
+#endif
+ struct s_strvec sx; s_allocate_strvec(n, &sx);
+ struct s_strvec sy; s_allocate_strvec(n, &sy);
+ struct s_strvec sz; s_allocate_strvec(n, &sz);
+
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ s_cvt_vec2strvec(n, x, &sx, 0);
+
+
+ // create matrix to pivot all the time
+// sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+ float *dummy;
+
+ int info;
+
+ /* timing */
+ struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+ /* warm up */
+ for(rep=0; rep<nrep; rep++)
+ {
+ sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+ }
+
+ float alpha = 1.0;
+ float beta = 0.0;
+
+	gettimeofday(&tv0, NULL); // start
+
+	gettimeofday(&tv1, NULL); // start
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// kernel_sgemm_nt_24x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_16x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_8x4_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_4x8_gen_lib8(n, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+// kernel_sgemm_nt_4x8_vs_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 4, 8);
+// kernel_sgemm_nt_4x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nn_16x4_lib8(n, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nn_8x8_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nn_8x4_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+// sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// sgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// spotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+ spotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+// sgetr_libstr(n, n, &sA, 0, 0, &sB, 0, 0);
+// sgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// sgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+// strmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// strmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// strsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_rltn_libstr(n, n, 1.0, &sB, 0, 0, &sD, 0, 0, &sD, 0, 0);
+// strsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// sgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// sgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// ssymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// sgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+ }
+
+//	s_print_strmat(n, n, &sD, 0, 0);
+
+ gettimeofday(&tv2, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+// sgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// sgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// scopy_(&n2, A, &i_1, B, &i_1);
+// ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+// strmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+// spotrf_(&c_l, &n, B2, &n, &info);
+// sgetrf_(&n, &n, B2, &n, ipiv, &info);
+// strsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+// strsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+// strtri_(&c_l, &c_n, &n, B2, &n, &info);
+// slauum_(&c_l, &n, B, &n, &info);
+// sgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+// sgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+// strmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// strsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// ssymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+// for(i=0; i<n; i++)
+// {
+// i_t = n-i;
+// scopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+// }
+// ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+// spotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+// sgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// sgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// ssyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+// strmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+// spotrf_(&c_l, &n77, B, &n77, &info);
+// strtri_(&c_l, &c_n, &n77, B, &n77, &info);
+// slauum_(&c_l, &n77, B, &n77, &info);
+#endif
+ }
+
+ gettimeofday(&tv3, NULL); // stop
+
+ // flops
+ if(1)
+ {
+
+ float Gflops_max = flops_max * GHz_max;
+
+// float flop_operation = 6*16.0*2*n; // kernel 24x4
+// float flop_operation = 4*16.0*2*n; // kernel 16x4
+// float flop_operation = 3*16.0*2*n; // kernel 12x4
+// float flop_operation = 2*16.0*2*n; // kernel 8x4
+// float flop_operation = 1*16.0*2*n; // kernel 4x4
+
+//	float flop_operation = 2.0*n*n*n; // sgemm
+//	float flop_operation = 1.0*n*n*n; // ssyrk strmm strsm
+	float flop_operation = 1.0/3.0*n*n*n; // spotrf strtri
+//	float flop_operation = 2.0/3.0*n*n*n; // sgetrf
+//	float flop_operation = 2.0*n*n; // sgemv ssymv
+//	float flop_operation = 1.0*n*n; // strmv strsv
+//	float flop_operation = 4.0*n*n; // sgemv_nt
+//	float flop_operation = 3*16.0*2*n; // kernel 12x4
+
+//	float flop_operation = 4.0/3.0*n*n*n; // ssyrk+spotrf
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gflops_hpmpc = 1e-9*flop_operation/time_hpmpc;
+ float Gflops_blasfeo = 1e-9*flop_operation/time_blasfeo;
+ float Gflops_blas = 1e-9*flop_operation/time_blas;
+
+
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+ }
+ // memops
+ else
+ {
+
+ float Gmemops_max = memops_max * GHz_max;
+
+	float memop_operation = 1.0*n*n; // sgecp
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gmemops_hpmpc = 1e-9*memop_operation/time_hpmpc;
+ float Gmemops_blasfeo = 1e-9*memop_operation/time_blasfeo;
+ float Gmemops_blas = 1e-9*memop_operation/time_blas;
+
+
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
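+
+	// Illustrative sketch (kept disabled; bytes_per_call and GB_s are hypothetical names):
+	// the memop count above is one float moved per matrix element, so an n x n
+	// single-precision copy touches 4*n*n bytes per call and the sustained bandwidth
+	// follows directly from the same averaged time.
+#if 0
+	float bytes_per_call = 4.0f * memop_operation;      // sizeof(float) * n * n
+	float GB_s = 1e-9 * bytes_per_call / time_blasfeo;  // sustained GB/s
+	printf("copy bandwidth: %7.2f GB/s (%7.2f Gmemops)\n", GB_s, Gmemops_blasfeo);
+#endif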
+
+ }
+
+
+ free(A);
+ free(B);
+ free(B2);
+ free(M);
+ free(x);
+ free(y);
+ free(x2);
+ free(y2);
+ free(ipiv);
+
+ s_free_strmat(&sA);
+ s_free_strmat(&sB);
+ s_free_strmat(&sC);
+ s_free_strmat(&sD);
+ s_free_strmat(&sE);
+ s_free_strvec(&sx);
+ s_free_strvec(&sy);
+ s_free_strvec(&sz);
+
+ }
+
+ printf("\n");
+
+// fprintf(f, "];\n");
+// fclose(f);
+
+ return 0;
+
+ }
+
diff --git a/test_problems/test_d_strmat.c b/test_problems/test_d_strmat.c
new file mode 100644
index 0000000..e06cf84
--- /dev/null
+++ b/test_problems/test_d_strmat.c
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+ {
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// d_print_mat(n, n, A, n);
+
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ double *C; d_zeros(&C, n, n);
+
+ double *D; d_zeros(&D, n, n);
+ for(ii=0; ii<n*n; ii++) D[ii] = -1;
+
+ double *x_n; d_zeros(&x_n, n, 1);
+// for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+ x_n[1] = 1.0;
+// x_n[1] = 1.0;
+// x_n[2] = 2.0;
+// x_n[3] = 3.0;
+ double *x_t; d_zeros(&x_t, n, 1);
+// for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+ x_t[0] = 1.0;
+ double *y_n; d_zeros(&y_n, n, 1);
+ double *y_t; d_zeros(&y_t, n, 1);
+ double *z_n; d_zeros(&z_n, n, 1);
+ double *z_t; d_zeros(&z_t, n, 1);
+
+ double *x0; d_zeros(&x0, n, 1); x0[0] = 1.0;
+ double *x1; d_zeros(&x1, n, 1); x1[1] = 1.0;
+ double *x2; d_zeros(&x2, n, 1); x2[2] = 1.0;
+ double *x3; d_zeros(&x3, n, 1); x3[3] = 1.0;
+ double *x4; d_zeros(&x4, n, 1); x4[4] = 1.0;
+ double *x5; d_zeros(&x5, n, 1); x5[5] = 1.0;
+ double *x6; d_zeros(&x6, n, 1); x6[6] = 1.0;
+ double *x7; d_zeros(&x7, n, 1); x7[7] = 1.0;
+ double *x8; d_zeros(&x8, n, 1); x8[8] = 1.0;
+ double *x9; d_zeros(&x9, n, 1); x9[9] = 1.0;
+
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+ int size_strmat = 5*d_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
+
+ struct d_strmat sA;
+// d_allocate_strmat(n, n, &sA);
+ d_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+// d_cast_mat2strmat(A, &sA);
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+// d_allocate_strmat(n, n, &sB);
+ d_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sC;
+// d_allocate_strmat(n, n, &sC);
+ d_create_strmat(n, n, &sC, ptr_memory_strmat);
+ ptr_memory_strmat += sC.memory_size;
+
+ struct d_strmat sD;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+ d_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+ struct d_strmat sE;
+// d_allocate_strmat(n, n, &sE);
+ d_create_strmat(n, n, &sE, ptr_memory_strmat);
+ ptr_memory_strmat += sE.memory_size;
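+
+	// The five matrices above are carved out of a single aligned buffer: d_size_strmat(n, n)
+	// gives the bytes needed per n x n d_strmat, d_create_strmat maps a struct onto the
+	// current position, and .memory_size advances the cursor. A minimal sketch of the same
+	// pattern in a loop (kept disabled; n_mats, mats, mem and cur are hypothetical names):
+#if 0
+	int n_mats = 5;
+	struct d_strmat mats[5];
+	void *mem; v_zeros_align(&mem, n_mats*d_size_strmat(n, n));
+	char *cur = (char *) mem;
+	for(ii=0; ii<n_mats; ii++)
+		{
+		d_create_strmat(n, n, &mats[ii], cur);
+		cur += mats[ii].memory_size;
+		}
+	v_free_align(mem);
+#endif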
+
+ struct d_strvec sx_n;
+ d_allocate_strvec(n, &sx_n);
+ d_cvt_vec2strvec(n, x_n, &sx_n, 0);
+
+ struct d_strvec sx_t;
+ d_allocate_strvec(n, &sx_t);
+ d_cvt_vec2strvec(n, x_t, &sx_t, 0);
+
+ struct d_strvec sy_n;
+ d_allocate_strvec(n, &sy_n);
+ d_cvt_vec2strvec(n, y_n, &sy_n, 0);
+
+ struct d_strvec sy_t;
+ d_allocate_strvec(n, &sy_t);
+ d_cvt_vec2strvec(n, y_t, &sy_t, 0);
+
+ struct d_strvec sz_n;
+ d_allocate_strvec(n, &sz_n);
+ d_cvt_vec2strvec(n, z_n, &sz_n, 0);
+
+ struct d_strvec sz_t;
+ d_allocate_strvec(n, &sz_t);
+ d_cvt_vec2strvec(n, z_t, &sz_t, 0);
+
+ struct d_strvec sx0; d_create_strvec(n, &sx0, x0);
+ struct d_strvec sx1; d_create_strvec(n, &sx1, x1);
+ struct d_strvec sx2; d_create_strvec(n, &sx2, x2);
+ struct d_strvec sx3; d_create_strvec(n, &sx3, x3);
+ struct d_strvec sx4; d_create_strvec(n, &sx4, x4);
+ struct d_strvec sx5; d_create_strvec(n, &sx5, x5);
+ struct d_strvec sx6; d_create_strvec(n, &sx6, x6);
+ struct d_strvec sx7; d_create_strvec(n, &sx7, x7);
+ struct d_strvec sx8; d_create_strvec(n, &sx8, x8);
+ struct d_strvec sx9; d_create_strvec(n, &sx9, x9);
+
+ struct d_strvec sz0; d_allocate_strvec(n, &sz0);
+ struct d_strvec sz1; d_allocate_strvec(n, &sz1);
+ struct d_strvec sz2; d_allocate_strvec(n, &sz2);
+ struct d_strvec sz3; d_allocate_strvec(n, &sz3);
+ struct d_strvec sz4; d_allocate_strvec(n, &sz4);
+ struct d_strvec sz5; d_allocate_strvec(n, &sz5);
+ struct d_strvec sz6; d_allocate_strvec(n, &sz6);
+ struct d_strvec sz7; d_allocate_strvec(n, &sz7);
+ struct d_strvec sz8; d_allocate_strvec(n, &sz8);
+ struct d_strvec sz9; d_allocate_strvec(n, &sz9);
+
+ // tests
+ double *v; d_zeros(&v, n, 1);
+ double *vp; d_zeros(&vp, n, 1);
+ double *vm; d_zeros(&vm, n, 1);
+ double *m; d_zeros(&m, n, 1);
+ double *r; d_zeros(&r, n, 1);
+
+ for(ii=0; ii<n; ii++) v[ii] = ii; // x
+ for(ii=0; ii<n; ii++) vp[ii] = 8.0; // upper
+ for(ii=0; ii<n; ii++) vm[ii] = 3.0; // lower
+ for(ii=0; ii<n; ii++) r[ii] = 2*ii+1; // x
+
+ d_print_mat(1, n, v, 1);
+ d_print_mat(1, n, vp, 1);
+ d_print_mat(1, n, vm, 1);
+ d_print_mat(1, n, r, 1);
+
+ struct d_strvec sv; d_create_strvec(n, &sv, v);
+ struct d_strvec svp; d_create_strvec(n, &svp, vp);
+ struct d_strvec svm; d_create_strvec(n, &svm, vm);
+ struct d_strvec sm; d_create_strvec(n, &sm, m);
+ struct d_strvec sr; d_create_strvec(n, &sr, r);
+
+// d_print_tran_strvec(n, &sv, 0);
+// d_print_tran_strvec(n, &svp, 0);
+// d_print_tran_strvec(n, &svm, 0);
+// d_print_tran_strvec(n, &sm, 0);
+// d_print_tran_strvec(n, &sr, 0);
+
+// d_print_tran_strvec(n, &sm, 0);
+// DVECEL_LIBSTR(&sm, 0) = 0.0;
+// DVECEL_LIBSTR(&sm, 1) = 1.0;
+// DVECEL_LIBSTR(&sm, 2) = 2.0;
+// d_print_tran_strvec(n, &sm, 0);
+// return 0;
+
+ double alpha = 1.0;
+ double beta = 0.0;
+ kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sA.cn, 0, sD.pA, sE.cn, 1, 3, 1, 3);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+ dtrmm_rlnn_libstr(8, 8, alpha, &sA, 3, 0, &sB, 0, 0, &sD, 0, 0);
+// dgemm_nn_libstr(8, 8, 8, alpha, &sB, 0, 0, &sA, 1, 0, beta, &sA, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+// dsyrk_ln_libstr(n, 15, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+// dpotrf_l_mn_libstr(n, 15, &sD, 0, 0, &sD, 0, 0);
+// dsyrk_dpotrf_ln_libstr(n, 15, n, &sA, 0, 0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dtrmm_rlnn_libstr(n, n, alpha, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+// kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+// kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+// kernel_dsyrk_nt_l_4x4_gen_lib4(n, &alpha, sA.pA, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 4, 0, 4);
+// kernel_dsyrk_nt_l_8x4_gen_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 8, 0, 8);
+// dsyrk_ln_libstr(10, 10, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 1, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx0, 0, beta, &sz0, 0, &sz0, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx1, 0, beta, &sz1, 0, &sz1, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx2, 0, beta, &sz2, 0, &sz2, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx3, 0, beta, &sz3, 0, &sz3, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx4, 0, beta, &sz4, 0, &sz4, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx5, 0, beta, &sz5, 0, &sz5, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx6, 0, beta, &sz6, 0, &sz6, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx7, 0, beta, &sz7, 0, &sz7, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx8, 0, beta, &sz8, 0, &sz8, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx9, 0, beta, &sz9, 0, &sz9, 0);
+ d_print_tran_strvec(n, &sz0, 0);
+ d_print_tran_strvec(n, &sz1, 0);
+ d_print_tran_strvec(n, &sz2, 0);
+ d_print_tran_strvec(n, &sz3, 0);
+ d_print_tran_strvec(n, &sz4, 0);
+ d_print_tran_strvec(n, &sz5, 0);
+ d_print_tran_strvec(n, &sz6, 0);
+ d_print_tran_strvec(n, &sz7, 0);
+ d_print_tran_strvec(n, &sz8, 0);
+ d_print_tran_strvec(n, &sz9, 0);
+ return 0;
+
+// d_print_strmat(n, n, &sC, 0, 0);
+// dgese_libstr(n, n, 1.0, &sB, 0, 0);
+// kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+// kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+ return 0;
+
+// d_print_strmat(n, n, &sC, 0, 0);
+// dgese_libstr(n, n, 1.0, &sB, 0, 0);
+// kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+// kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+// kernel_dger4_sub_8_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_8_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 5);
+// kernel_dger4_sub_12_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_12_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 9);
+// kernel_dger4_sub_8c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_4c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+// d_print_strmat(n, n, &sC, 0, 0);
+// return 0;
+
+#if 1
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+#else
+ dgese_libstr(n, n, 0.1, &sC, 0, 0);
+ DMATEL_LIBSTR(&sC, 0, 0) = 1.0;
+// DMATEL_LIBSTR(&sC, 0, 1) = 1.0;
+ for(ii=1; ii<n-1; ii++)
+ {
+// DMATEL_LIBSTR(&sC, ii, ii-1) = 1.0;
+ DMATEL_LIBSTR(&sC, ii, ii) = 1.0;
+// DMATEL_LIBSTR(&sC, ii, ii+1) = 1.0;
+ }
+// DMATEL_LIBSTR(&sC, n-1, n-2) = 1.0;
+ DMATEL_LIBSTR(&sC, n-1, n-1) = 1.0;
+#endif
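+	// Note on the disabled branch above: DMATEL_LIBSTR(&sC, i, j) reads/writes a single
+	// element of the panel-major strmat, so the #else path fills sC with 0.1 and then puts
+	// ones on the diagonal (the commented off-diagonal lines would make it tridiagonal),
+	// giving a simple structured input for the LQ factorization tested below.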
+ d_print_strmat(n, n, &sC, 0, 0);
+ dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+// d_print_strmat(n, n, &sA, 0, 0);
+// dgein1_libstr(12.0, &sA, 0, 0);
+// DMATEL_LIBSTR(&sA, 0, 0) = 12.0;
+// DMATEL_LIBSTR(&sA, 1, 0) = 6.0;
+// DMATEL_LIBSTR(&sA, 2, 0) = - 4.0;
+// DMATEL_LIBSTR(&sA, 0, 1) = - 51.0;
+// DMATEL_LIBSTR(&sA, 1, 1) = 167.0;
+// DMATEL_LIBSTR(&sA, 2, 1) = 24.0;
+// DMATEL_LIBSTR(&sA, 0, 2) = 4.0;
+// DMATEL_LIBSTR(&sA, 1, 2) = - 68.0;
+// DMATEL_LIBSTR(&sA, 2, 2) = - 41.0;
+// d_print_strmat(n, n, &sA, 0, 0);
+ d_print_strmat(n, n, &sC, 0, 0);
+// printf("\n%f\n", DGEEL_LIBSTR(&sA, 0, 0));
+// int qr_work_size = dgeqrf_work_size_libstr(n, n);
+ int qr_work_size = dgelqf_work_size_libstr(n, n);
+ void *qr_work;
+ v_zeros_align(&qr_work, qr_work_size);
+// dgeqrf_libstr(10, 10, &sC, 0, 0, &sD, 0, 0, qr_work);
+ dgelqf_libstr(17, 17, &sC, 0, 0, &sD, 0, 0, qr_work);
+// dgecp_libstr(10, 10, &sC, 0, 0, &sD, 0, 0);
+// kernel_dgeqrf_4_lib4(16, 12, sD.pA, sD.cn, sD.dA, qr_work);
+// d_print_strmat(n, n, &sA, 0, 0);
+// kernel_dgeqrf_vs_lib4(10, 16, 0, sD.pA+0, sD.cn, sD.dA);
+// kernel_dgelqf_vs_lib4(10, 10, 10, 0, sD.pA+0, sD.cn, sD.dA);
+ d_print_strmat(n, n, &sD, 0, 0);
+ free(qr_work);
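+
+	// Workspace pattern used above: dgelqf_work_size_libstr (or dgeqrf_work_size_libstr for
+	// the QR variant) returns the number of bytes the factorization needs, the caller
+	// allocates an aligned buffer of that size, passes it to dgelqf_libstr / dgeqrf_libstr,
+	// and releases it afterwards; the factors of the input (here sC) are written to the
+	// output matrix (here sD), which is what gets printed above.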
+ return 0;
+
+// dveccl_mask_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0, &sm, 0);
+// veccl_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0);
+// d_print_tran_strvec(12, &sv, 0);
+// d_print_tran_strvec(12, &sm, 0);
+// dvecze_libstr(n, &sm, 0, &sr, 0, &sr, 0);
+// d_print_tran_strvec(12, &sr, 0);
+// return 0;
+
+// d_print_strmat(n, n, &sA, 0, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx0, 0, &sz0, 0);
+// d_print_tran_strvec(n, &sz0, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx1, 0, &sz1, 0);
+// d_print_tran_strvec(n, &sz1, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx2, 0, &sz2, 0);
+// d_print_tran_strvec(n, &sz2, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx3, 0, &sz3, 0);
+// d_print_tran_strvec(n, &sz3, 0);
+// return 0;
+
+// double alpha = 1.0;
+// double beta = 1.0;
+// kernel_dgemm_nt_4x12_vs_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA, 3, 10);
+// kernel_dgemm_nt_8x8u_vs_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn, 7, 6);
+ dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ dpotrf_l_libstr(16, &sD, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+// dmatse_libstr(n, n, 100.0, &sD, 0, 0);
+
+// for(ii=0; ii<n; ii++)
+// dvecin1_libstr(ii+1, &sx_n, ii);
+// d_print_tran_strvec(n, &sx_n, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// // ddiain_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+// ddiaad_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+// d_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+
+// d_print_tran_strvec(n, &sx_n, 0);
+// dgemm_l_diag_libstr(n, n, 1.0, &sx_n, 0, &sA, 0, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// dgemm_r_diag_libstr(n, n, 1.0, &sA, 0, 0, &sx_n, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// exit(1);
+
+// dsetmat_libstr(n, n, 0.0, &sD, 0, 0);
+// dmatin1_libstr(2.0, &sD, 0, 0);
+// dmatin1_libstr(2.0, &sD, 1, 1);
+// dmatin1_libstr(2.0, &sD, 2, 2);
+// dmatin1_libstr(1.0, &sD, 1, 0);
+// dmatin1_libstr(1.0, &sD, 2, 1);
+// dmatin1_libstr(0.5, &sD, 2, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// d_print_tran_strvec(n, &sx_n, 0);
+// dtrsv_lnn_libstr(n, n, &sD, 0, 0, &sx_n, 0, &sz_n, 0);
+// d_print_tran_strvec(n, &sz_n, 0);
+// exit(1);
+
+// dgemm_nt_libstr(8, 8, 8, 1.0, &sB, 0, 0, &sA, 1, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+
+// double alpha = 1.0;
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA, 2, sA.pA, sA.cn, 1, sD.pA, sD.cn, 0, 4, 1, 4);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA+sB.cn*4, 2, sA.pA, sA.cn, 1, sD.pA+sD.cn*4, sD.cn, 0, 4, 1, 4);
+// kernel_dtrmm_nn_rl_4x4_lib4(4, &alpha, sB.pA, sA.pA, sA.cn+4*4, sD.pA+4*4);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(3, &alpha, sB.pA+sB.cn*4+4*4, 2, sA.pA+sB.cn*4+4*4, sA.cn, 1, sD.pA+sD.cn*4+4*4, sD.cn, 0, 4, 0, 4);
+ dtrmm_rlnn_libstr(8, 8, 1.0, &sB, 0, 0, &sA, 3, 0, &sD, 2, 1);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx0, 0, &sx0, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx1, 0, &sx1, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx2, 0, &sx2, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx3, 0, &sx3, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx4, 0, &sx4, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx5, 0, &sx5, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx6, 0, &sx6, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx7, 0, &sx7, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx8, 0, &sx8, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx9, 0, &sx9, 0);
+ d_print_tran_strvec(n, &sx0, 0);
+ d_print_tran_strvec(n, &sx1, 0);
+ d_print_tran_strvec(n, &sx2, 0);
+ d_print_tran_strvec(n, &sx3, 0);
+ d_print_tran_strvec(n, &sx4, 0);
+ d_print_tran_strvec(n, &sx5, 0);
+ d_print_tran_strvec(n, &sx6, 0);
+ d_print_tran_strvec(n, &sx7, 0);
+ d_print_tran_strvec(n, &sx8, 0);
+ d_print_tran_strvec(n, &sx9, 0);
+ return 0;
+
+ dgemv_t_libstr(2, 8, 1.0, &sA, 2, 0, &sx_n, 0, 0.0, &sy_n, 0, &sz_n, 0);
+ d_print_tran_strvec(n, &sz_n, 0);
+ return 0;
+
+ dgemm_nt_libstr(4, 8, 8, 1.0, &sB, 0, 0, &sA, 0, 0, 0.0, &sB, 0, 0, &sD, 3, 0);
+// d_print_strmat(n, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ exit(1);
+
+ dpotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+// dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+// dgetrf_libstr(n, n, &sD, 0, 0, &sD, 0, 0, ipiv);
+ d_print_strmat(n, n, &sD, 0, 0);
+#if defined(LA_HIGH_PERFORMANCE) || defined(LA_REFERENCE)
+ d_print_mat(1, n, sD.dA, 1);
+#endif
+ int_print_mat(1, n, ipiv, 1);
+ dtrsm_rltn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ exit(1);
+
+#if 1 // solve P L U X = P B
+ d_print_strmat(n, n, &sB, 0, 0);
+ drowpe_libstr(n, ipiv, &sB);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+#else // solve X^T (P L U)^T = B^T P^T
+ d_print_strmat(n, n, &sB, 0, 0);
+ dcolpe_libstr(n, ipiv, &sB);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+#endif
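+
+	// Both branches above are written for the dgetrf_libstr factorization (currently
+	// commented out earlier in this test): the permutation recorded in ipiv is applied to B
+	// (drowpe on the rows in the first branch, dcolpe on the columns in the transposed one),
+	// then the unit-lower and upper triangular factors stored in sD are solved against in
+	// sequence (llnu then lunn on the left, or rltu then rutn on the right), leaving the
+	// solution in sE.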
+
+// d_print_strmat(n, n, &sA, 0, 0);
+// d_print_strmat(n, n, &sB, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// d_print_strmat(n, n, &sE, 0, 0);
+
+// d_cvt_strmat2mat(n, n, &sE, 0, 0, C, n);
+// d_print_mat(n, n, C, n);
+
+ dtrtr_u_libstr(6, &sE, 2, 0, &sB, 1, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ d_print_strmat(n, n, &sA, 0, 0);
+ dgemv_nt_libstr(6, n, 1.0, 1.0, &sA, 0, 0, &sx_n, 0, &sx_t, 0, 0.0, 0.0, &sy_n, 0, &sy_t, 0, &sz_n, 0, &sz_t, 0);
+// dsymv_l_libstr(5, 5, 1.0, &sA, 0, 0, x_n, 0.0, y_n, z_n);
+ d_print_mat(1, n, z_n, 1);
+ d_print_mat(1, n, z_t, 1);
+
+
+
+
+// for(ii=0; ii<sE.pm*sE.cn; ii++) sE.pA[ii] = 0.0;
+// double alpha = 0.0;
+// double beta = 1.0;
+// kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 3, sA.pA, sA.cn, 0, sE.pA, sE.cn, 0, 4, 2, 2);
+// d_print_strmat(n, n, &sE, 0, 0);
+
+ // free memory
+ free(A);
+ free(B);
+ free(C);
+ free(D);
+ free(ipiv);
+// d_free_strmat(&sA);
+// d_free_strmat(&sB);
+// d_free_strmat(&sD);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_s_strmat.c b/test_problems/test_s_strmat.c
new file mode 100644
index 0000000..456db87
--- /dev/null
+++ b/test_problems/test_s_strmat.c
@@ -0,0 +1,191 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+ {
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii, jj;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+ float *A; s_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// for(jj=0; jj<n; jj++)
+// for(ii=0; ii<jj; ii++)
+// A[ii+n*jj] = 0.0/0.0;
+// s_print_mat(n, n, A, n);
+
+ float *B; s_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// s_print_mat(n, n, B, n);
+
+ float *D; s_zeros(&D, n, n);
+ for(ii=0; ii<n*n; ii++) D[ii] = -1.0;
+// s_print_mat(n, n, B, n);
+
+
+ //
+ // matrices in matrix struct format
+ //
+
+ struct s_strmat sA;
+ s_allocate_strmat(n, n, &sA);
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ s_print_strmat(n, n, &sA, 0, 0);
+
+ struct s_strmat sB;
+ s_allocate_strmat(n, n, &sB);
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ struct s_strmat sD;
+ s_allocate_strmat(n, n, &sD);
+ s_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+ struct s_strvec sx;
+ s_allocate_strvec(n, &sx);
+ sx.pa[7] = 1.0;
+ s_print_tran_strvec(n, &sx, 0);
+
+ struct s_strvec sz0;
+ s_allocate_strvec(n, &sz0);
+
+ struct s_strvec sz1;
+ s_allocate_strvec(n, &sz1);
+
+ //
+ // tests
+ //
+
+ float alpha = 1.0;
+ float beta = 0.0;
+// kernel_sgemm_nt_24x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_16x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_8x4_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_4x8_gen_lib8(8, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+// kernel_sgemm_nt_8x4_vs_lib8(8, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 7, 4);
+// kernel_sgemm_nt_8x4_lib8(8, &alpha, sB.pA, sA.pA+4, &beta, sA.pA+4*8, sD.pA+4*8);
+// kernel_sgemm_nn_16x4_lib8(4, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_12x4_lib4(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib4(8, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x4_lib4(2, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// s_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+// sgemm_nt_libstr(n, n, 5, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// ssyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// kernel_ssyrk_nt_l_8x8_lib8(n, &alpha, sA.pA, sA.pA, &beta, sB.pA, sD.pA);
+// sgecp_libstr(16, 16, &sA, 2, 0, &sD, 1, 0);
+// sgetr_libstr(16, 16, &sA, 2, 0, &sD, 2, 0);
+// s_print_strmat(n, n, &sD, 0, 0);
+// sgemv_n_libstr(6, 6, 1.0, &sA, 1, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+// sgemv_t_libstr(11, 8, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+// strmv_lnn_libstr(6, 6, &sA, 1, 0, &sx, 0, &sz0, 0);
+// strmv_ltn_libstr(10, 10, &sA, 1, 0, &sx, 0, &sz0, 0);
+// sA.pA[0] = 1.0;
+// strsv_lnn_libstr(10, &sA, 0, 0, &sx, 0, &sz0, 0);
+// for(ii=0; ii<8; ii++) sA.dA[ii] = 1.0/sgeex1_libstr(&sA, ii, ii);
+// kernel_strsv_lt_inv_8_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa);
+// kernel_strsv_lt_inv_8_vs_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa, 3);
+// s_print_strmat(n, n, &sA, 0, 0);
+// strsv_ltn_libstr(12, &sA, 0, 0, &sx, 0, &sz0, 0);
+// strsv_ltn_mn_libstr(11, 3, &sA, 0, 0, &sx, 0, &sz0, 0);
+// s_print_strmat(n, n, &sA, 0, 0);
+// kernel_sgemv_nt_4_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa);
+// kernel_sgemv_nt_4_vs_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa, 3);
+// sgemv_nt_libstr(5, 2, alpha, alpha, &sA, 0, 0, &sx, 0, &sx, 0, beta, beta, &sz0, 0, &sz1, 0, &sz0, 0, &sz1, 0);
+// ssymv_l_libstr(10, 10, alpha, &sA, 1, 0, &sx, 0, beta, &sz0, 0, &sz1, 0);
+// s_print_tran_strvec(n, &sz0, 0);
+// s_print_tran_strvec(n, &sz1, 0);
+// return 0;
+// sgesc_libstr(16, 9, 2.0, &sD, 0, 0);
+// s_print_strmat(n, n, &sD, 0, 0);
+// kernel_spotrf_nt_l_8x8_lib8(0, sD.pA, sD.pA, sD.pA, sD.pA, sx.pa);
+// s_print_strmat(n, n, &sD, 0, 0);
+// s_print_tran_strvec(n, &sx, 0);
+// kernel_strsm_nt_rl_inv_8x8_lib8(0, sD.pA, sD.pA, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA, sx.pa);
+// s_print_strmat(n, n, &sD, 0, 0);
+// kernel_spotrf_nt_l_8x8_lib8(8, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA+8*sD.cn+8*8, sD.pA+8*sD.cn+8*8, sx.pa+8);
+// spotrf_l_mn_libstr(23, 17, &sD, 0, 0, &sD, 0, 0);
+// spotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+// kernel_strmm_nn_rl_8x4_lib8(3, &alpha, sB.pA, 7, sA.pA, sA.cn, sD.pA);
+ strmm_rlnn_libstr(12, 8, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ s_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+
+
+
+ //
+ // free memory
+ //
+
+ free(A);
+ free(B);
+ free(D);
+ s_free_strmat(&sA);
+ s_free_strmat(&sB);
+ s_free_strmat(&sD);
+
+ return 0;
+
+ }