Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/c99/Makefile b/kernel/c99/Makefile
new file mode 100644
index 0000000..55d54ef
--- /dev/null
+++ b/kernel/c99/Makefile
@@ -0,0 +1,80 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# BLASFEO is free software; you can redistribute it and/or                                       #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# BLASFEO is distributed in the hope that it will be useful,                                     #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with BLASFEO; if not, write to the Free Software                                 #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemv_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemv_4_lib4.o
+#OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+OBJS +=
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or                                       *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful,                                     *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software                                 *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D[m0:m1, n0:n1] <= beta*C + alpha * A * B^T on a 4x4 tile, where the tile
// may straddle two 4-row panels of C (offsetC) and of D (offsetD).
// A and B are 4-row panel-major; one k-step reads A[0..3] and B[0..3].
void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// Accumulator tile, cc[col][row], in exact correspondence with the
	// scalar registers c_<row><col> of the reference implementation; the
	// per-accumulator addition order over k is identical, so results match
	// bit for bit.
	double cc[4][4] = {{0.0}};

	double *C1, *D1;
	double *crow[4], *drow[4];

	int k, ii, jj;

	// multiply-accumulate loop
	for(k=0; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[jj][ii] += A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// Scale and add C: row ii of the tile lives in panel C0 while
	// offsetC+ii<4, otherwise in the next panel C1.
	C1 = C0 + sdc*bs;
	for(ii=0; ii<4; ii++)
		{
		crow[ii] = (offsetC+ii<4) ? C0+offsetC+ii : C1+offsetC+ii-4;
		}
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[jj][ii] = beta[0]*crow[ii][bs*jj] + alpha[0]*cc[jj][ii];
			}
		}

	// Shift the solution left by n0 columns (clamped to 3, as in the
	// reference code) and advance D accordingly.
	if(n0>0)
		{
		int ns = n0<3 ? n0 : 3;
		for(jj=0; jj<4-ns; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[jj][ii] = cc[jj+ns][ii];
				}
			}
		D0 += ns*bs;
		}

	int kn = n1 - n0;

	// Store, masking rows to [m0, m1) and columns to [0, kn), honoring the
	// row offset of D within its panel pair.
	D1 = D0 + sdd*bs;
	for(ii=0; ii<4; ii++)
		{
		drow[ii] = (offsetD+ii<4) ? D0+offsetD+ii : D1+offsetD+ii-4;
		}
	for(jj=0; jj<4; jj++)
		{
		if(kn<=jj)
			return;
		for(ii=0; ii<4; ii++)
			{
			if(m0<=ii & m1>ii)
				drow[ii][bs*jj] = cc[jj][ii];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D <= beta*C + alpha * A * B^T on a 4x4 tile, storing only the first km
// rows and kn columns. A and B are 4-row panel-major; one k-step reads
// A[0..3] and B[0..3].
void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// Accumulator tile, cc[col][row]; the per-accumulator addition order
	// over k is identical to the unrolled reference, so results match bit
	// for bit.
	double cc[4][4] = {{0.0}};

	int k, ii, jj;

	// multiply-accumulate loop
	for(k=0; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[jj][ii] += A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// scale and add C (always reads the full 4x4 tile of C)
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[jj][ii] = beta[0]*C[ii+bs*jj] + alpha[0]*cc[jj][ii];
			}
		}

	// Store the first km rows (clamped to [1,4], as in the reference code)
	// of the first kn columns (all four when kn is outside 1..3).
	int mrows = km<4 ? km : 4;
	if(mrows<1)
		mrows = 1;
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<mrows; ii++)
			{
			D[ii+bs*jj] = cc[jj][ii];
			}
		if(kn==jj+1)
			return;
		}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
// Full-tile dgemm nt: delegate to the variable-size kernel with km = kn = 4.
void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
	return;
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D[m0:m1, n0:n1] <= beta*C + alpha * A * B on a 4x4 tile, where B starts
// offsetB rows into a 4-row panel (panel stride sdb) and the tile may
// straddle two panels of C (offsetC) and of D (offsetD).
void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// Accumulator tile, cc[col][row], mirroring the scalar registers
	// c_<row><col> of the reference implementation; the per-accumulator
	// addition order over k is identical, so results match bit for bit.
	double cc[4][4] = {{0.0}};

	double *C1, *D1;
	double *crow[4], *drow[4];

	int k, ii, jj, ll;

	k = 0;
	if(offsetB!=0)
		{
		// Peel 4-offsetB k-iterations so B becomes aligned to a panel
		// boundary. As in the reference code, the first peeled iteration is
		// performed unconditionally (even when kmax==0), and the kmax check
		// is skipped after the last peeled iteration.
		int kpeel = 4-offsetB;
		B += offsetB;
		for(ll=0; ll<kpeel; ll++)
			{
			for(jj=0; jj<4; jj++)
				{
				for(ii=0; ii<4; ii++)
					{
					cc[jj][ii] += A[ii] * B[4*jj];
					}
				}
			A += 4;
			B += 1;
			k += 1;
			if(ll==kpeel-1)
				B += bs*(sdb-1); // jump to the next panel of B
			else if(k>=kmax)
				goto scale;
			}
		}
	// main loop: 4 k-iterations per panel of B
	for(; k<kmax-3; k+=4)
		{
		for(ll=0; ll<4; ll++)
			{
			for(jj=0; jj<4; jj++)
				{
				for(ii=0; ii<4; ii++)
					{
					cc[jj][ii] += A[4*ll+ii] * B[ll+4*jj];
					}
				}
			}
		A += 16;
		B += 4*sdb;
		}
	// cleanup loop
	for(; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[jj][ii] += A[ii] * B[4*jj];
				}
			}
		A += 4;
		B += 1;
		}

	scale:

	// Scale and add C: row ii of the tile lives in panel C0 while
	// offsetC+ii<4, otherwise in the next panel C1.
	C1 = C0 + sdc*bs;
	for(ii=0; ii<4; ii++)
		{
		crow[ii] = (offsetC+ii<4) ? C0+offsetC+ii : C1+offsetC+ii-4;
		}
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[jj][ii] = beta[0]*crow[ii][bs*jj] + alpha[0]*cc[jj][ii];
			}
		}

	// Shift the solution left by n0 columns (clamped to 3, as in the
	// reference code) and advance D accordingly.
	if(n0>0)
		{
		int ns = n0<3 ? n0 : 3;
		for(jj=0; jj<4-ns; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[jj][ii] = cc[jj+ns][ii];
				}
			}
		D0 += ns*bs;
		}

	int kn = n1 - n0;

	// Store, masking rows to [m0, m1) and columns to [0, kn), honoring the
	// row offset of D within its panel pair.
	D1 = D0 + sdd*bs;
	for(ii=0; ii<4; ii++)
		{
		drow[ii] = (offsetD+ii<4) ? D0+offsetD+ii : D1+offsetD+ii-4;
		}
	for(jj=0; jj<4; jj++)
		{
		if(kn<=jj)
			return;
		for(ii=0; ii<4; ii++)
			{
			if(m0<=ii & m1>ii)
				drow[ii][bs*jj] = cc[jj][ii];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full 4x4 dgemm tile, D = beta*C + alpha*A*B (B panel-major, starting at
// column offset offsetB with panel stride sdb): thin wrapper that forwards to
// the generalized kernel with zero panel offsets and the full, unmasked
// row/column range [0,4) x [0,4).
void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
	{
	kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute one 4x4 tile of a lower-triangular SYRK update:
//   D = beta*C + alpha * tril(A * B^T)   (lower triangle only)
// A, B: 4 x kmax operands packed in contiguous 4-row panels (bs=4).
// offsetC/offsetD: row offset (assumed 0..3, as in the unrolled variant) of
//   the tile inside the C/D panel; rows that spill past row 3 of the panel
//   continue at C0+sdc*bs / D0+sdd*bs (the next panel).
// m0/m1: only rows i with m0<=i<m1 are stored.
// n0/n1: the solution is shifted left by n0 columns and columns [0, n1-n0)
//   of the shifted tile are stored.
void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// column-major 4x4 accumulator: cc[i+4*j] is element (i,j);
	// only the lower triangle is ever touched
	double cc[16] = {0.0};

	double *pC[4], *pD[4];

	int i, j, k;

	// accumulate the lower triangle of A * B^T
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			for(i=j; i<4; i++)
				cc[i+4*j] += A[i] * B[j];
		A += 4;
		B += 4;
		}

	// row i of the tile sits offsetC rows down in C, possibly crossing into
	// the next 4-row panel
	for(i=0; i<4; i++)
		pC[i] = i+offsetC<4 ? C0+i+offsetC : C0+sdc*bs+i+offsetC-4;

	// blend with C (lower triangle only)
	for(j=0; j<4; j++)
		for(i=j; i<4; i++)
			cc[i+4*j] = beta[0]*pC[i][bs*j] + alpha[0]*cc[i+4*j];

	// shift the solution left by n0 columns (shift amount clamped to 3,
	// matching the original's else-branch); entries above the shifted
	// sub-diagonal keep their stale values and rely on the m0 row mask
	if(n0>0)
		{
		int s = n0<3 ? n0 : 3;
		for(j=0; j+s<4; j++)
			for(i=j+s; i<4; i++)
				cc[i+4*j] = cc[i+4*(j+s)];
		D0 += s*bs;
		}

	int kn = n1 - n0;

	// same row mapping for D (offsetD assumed 0..3)
	for(i=0; i<4; i++)
		pD[i] = i+offsetD<4 ? D0+i+offsetD : D0+sdd*bs+i+offsetD-4;

	// store the lower triangle, masked to rows [m0,m1) and columns [0,kn)
	for(j=0; j<4; j++)
		{
		if(kn<=j)
			return;
		for(i=j; i<4; i++)
			if(m0<=i & m1>i)
				pD[i][bs*j] = cc[i+4*j];
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute one 4x4 tile of a lower-triangular SYRK update:
//   D = beta*C + alpha * tril(A * B^T)   (lower triangle only)
// A, B: 4 x kmax operands packed in contiguous 4-row panels (bs=4).
// C, D: 4x4 tiles with column stride bs.
// km, kn: variable-size clipping — rows [0, min(km,4)) and columns up to the
//   first j with kn==j+1 are stored; km<1 still stores element (0,0), exactly
//   like the fully unrolled variant.
void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// column-major 4x4 accumulator: cc[i+4*j] is element (i,j);
	// only the lower triangle is ever touched
	double cc[16] = {0.0};

	int i, j, k;

	// accumulate the lower triangle of A * B^T
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			for(i=j; i<4; i++)
				cc[i+4*j] += A[i] * B[j];
		A += 4;
		B += 4;
		}

	// blend with C (lower triangle only)
	for(j=0; j<4; j++)
		for(i=j; i<4; i++)
			cc[i+4*j] = beta[0]*C[i+bs*j] + alpha[0]*cc[i+4*j];

	// clipped store of the lower triangle
	int imax = km<4 ? km : 4;
	if(imax<1)
		imax = 1; // element (0,0) is always stored
	for(j=0; j<4; j++)
		{
		for(i=j; i<imax; i++)
			D[i+bs*j] = cc[i+4*j];
		if(kn==j+1)
			return;
		}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full 4x4 lower-triangular SYRK tile, D = beta*C + alpha*tril(A*B^T):
// forwards to the variable-size kernel with km=kn=4 (no edge clipping).
void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute one 4x4 tile of D = beta*C + alpha * A * B^T with B upper
// triangular (nt, right-upper): row k of the packed B has entries only in
// columns 0..min(k,3), so column j of the result starts accumulating at k==j.
// A, B: 4 x kmax panels (bs=4, contiguous). C, D: 4x4 tiles, column stride bs.
// km, kn: variable-size clipping of the stored tile — rows [0, min(km,4))
//   and columns up to the first j with kn==j+1; km<1 still stores row 0,
//   matching the unrolled original.
//
// NOTE(review): the previous unrolled prologue guarded its k=1 and k=2 steps
// with `if(kmax>0)` instead of `if(kmax>1)` / `if(kmax>2)`, so calls with
// kmax==1 or kmax==2 executed extra steps, reading past the ends of A and B
// and accumulating spurious terms. The rolled triangular loop below is
// inherently bounded by kmax, which fixes that.
void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// column-major 4x4 accumulator: cc[i+4*j] is element (i,j)
	double cc[16] = {0.0};

	int i, j, k;

	for(k=0; k<kmax; k++)
		{
		// column j of B^T is active only from k>=j (upper triangular factor)
		int jmax = k<3 ? k+1 : 4;
		for(j=0; j<jmax; j++)
			{
			double b = B[j];
			for(i=0; i<4; i++)
				cc[i+4*j] += A[i] * b;
			}
		A += 4;
		B += 4;
		}

	// blend with C
	for(j=0; j<4; j++)
		for(i=0; i<4; i++)
			cc[i+4*j] = beta[0]*C[i+bs*j] + alpha[0]*cc[i+4*j];

	// store, clipped to km rows and kn columns
	int imax = km<4 ? km : 4;
	if(imax<1)
		imax = 1; // the original always stored element (0,0)
	for(j=0; j<4; j++)
		{
		for(i=0; i<imax; i++)
			D[i+bs*j] = cc[i+4*j];
		if(kn==j+1)
			return;
		}

	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full 4x4 dtrmm tile (B upper triangular, transposed operand layout):
// forwards to the variable-size kernel with km=kn=4 (no edge clipping).
void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute one 4x4 tile of D = alpha * A * B with B lower triangular (nn,
// right-lower). B is packed in 4-row panels with panel stride sdb and starts
// at row offset offsetB (0..3) inside its first panel; row k of B has entries
// in columns 0..min(k,3), so column j starts accumulating at k==j.
// offsetD: row offset (assumed 0..3) of the tile inside the D panel; rows
//   that spill past row 3 continue at D0+sdd*bs (the next panel).
// m0/m1: only rows i with m0<=i<m1 are stored.
// n0/n1: the solution is shifted left by n0 columns and columns [0, n1-n0)
//   of the shifted tile are stored.
void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// column-major 4x4 accumulator: cc[i+4*j] is element (i,j)
	double cc[16] = {0.0};

	int i, j, k, r;

	B += offsetB;

	// Prologue length: advance one k at a time until B is back at a panel
	// boundary AND all four columns of the triangular B are active (k>=3).
	// This gives 4/3/6/5 steps for offsetB = 0/1/2/3, exactly matching the
	// four unrolled cases of the original.
	int npro = (4-offsetB)&3;
	if(npro<3)
		npro += 4;

	k = 0;
	while(k<npro)
		{
		// column j of B is active only from k>=j
		int jmax = k<3 ? k+1 : 4;
		for(j=0; j<jmax; j++)
			{
			double b = B[4*j];
			for(i=0; i<4; i++)
				cc[i+4*j] += A[i] * b;
			}
		A += 4;
		// jump to the next 4-row panel once its last row has been consumed
		B += ((offsetB+k)&3)==3 ? 4*sdb-3 : 1;
		k += 1;
		if(k>=kmax)
			goto scale;
		}

	// main loop: B panel-aligned, full 4x4 tile every step
	r = 0; // row inside the current panel of B
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			double b = B[r+4*j];
			for(i=0; i<4; i++)
				cc[i+4*j] += A[i] * b;
			}
		A += 4;
		r += 1;
		if(r==4)
			{
			r = 0;
			B += 4*sdb;
			}
		}

	scale:

	for(j=0; j<4; j++)
		for(i=0; i<4; i++)
			cc[i+4*j] *= alpha[0];

	// shift the solution left by n0 columns (shift amount clamped to 3,
	// matching the original's else-branch)
	if(n0>0)
		{
		int s = n0<3 ? n0 : 3;
		for(j=0; j+s<4; j++)
			for(i=0; i<4; i++)
				cc[i+4*j] = cc[i+4*(j+s)];
		D0 += s*bs;
		}

	int kn = n1 - n0;

	// store, masked to rows [m0,m1) and the first kn columns; row i lands
	// offsetD rows down in the D panel, possibly spilling into the next one
	for(j=0; j<4; j++)
		{
		if(kn<=j)
			return;
		for(i=0; i<4; i++)
			if(m0<=i & m1>i)
				{
				if(i+offsetD<4)
					D0[i+offsetD+bs*j] = cc[i+4*j];
				else
					D0[sdd*bs+i+offsetD-4+bs*j] = cc[i+4*j];
				}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
	{
	// Fixed-size 4x4 variant: delegate to the generalized kernel with zero
	// C/D offsets and the full 0..4 row/column ranges.
	kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
	{
	// Fixed-size variant: full 4x4 block (km = kn = 4).
	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
	{
	// Fused variable-size syrk + Cholesky: first update D from C with unit
	// alpha/beta coefficients, then factorize D in place.
	double one_a = 1.0;
	double one_b = 1.0;
	kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &one_a, Ap, Bp, &one_b, C, D, km, kn);
	kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
	{
	// Fused fixed-size syrk + Cholesky with unit alpha/beta coefficients.
	double one_a = 1.0;
	double one_b = 1.0;
	kernel_dsyrk_nt_l_4x4_lib4(kp, &one_a, Ap, Bp, &one_b, C, D);
	kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 triangular-solve kernel: compute C - A*B^T, then solve
// against the transposed lower-triangular E using the precomputed reciprocal
// diagonal inv_diag_E, storing the km x kn visible part into D.
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// 4x4 accumulator, indexed c[row][col]
	double c[4][4] = {{0.0}};

	double scal;

	int i, j, jj, k;

	// c[i][j] -= sum_k A[i+bs*k] * B[j+bs*k]   (i.e. - A * B^T)
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				c[i][j] -= A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// add C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[i][j] = C[i+bs*j] + c[i][j];
			}
		}

	// forward substitution: column j first accumulates the strictly lower
	// entries of E against the already-solved columns, then is scaled by
	// the reciprocal diagonal; columns at and beyond kn are left untouched
	for(j=0; j<4; j++)
		{
		for(jj=0; jj<j; jj++)
			{
			scal = E[j+bs*jj];
			for(i=0; i<4; i++)
				{
				c[i][j] -= c[i][jj] * scal;
				}
			}
		scal = inv_diag_E[j];
		for(i=0; i<4; i++)
			{
			c[i][j] *= scal;
			}
		if(kn==j+1)
			break;
		}

	// store the km x kn part; the clamps reproduce the original branch
	// dispatch (km<1 still stores row 0, kn outside 1..3 stores all 4
	// columns)
	int mmax = km<1 ? 1 : (km<4 ? km : 4);
	int nmax = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<nmax; j++)
		{
		for(i=0; i<mmax; i++)
			{
			D[i+bs*j] = c[i][j];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
	{
	// Fixed-size variant: full 4x4 block (km = kn = 4).
	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{
	// Fused variable-size gemm + triangular solve: update D from C with
	// unit alpha/beta coefficients, then solve in place against E.
	double one_a = 1.0;
	double one_b = 1.0;
	kernel_dgemm_nt_4x4_vs_lib4(kp, &one_a, Ap, Bp, &one_b, C, D, km, kn);
	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
	{
	// Fused fixed-size gemm + triangular solve with unit alpha/beta.
	double one_a = 1.0;
	double one_b = 1.0;
	kernel_dgemm_nt_4x4_lib4(kp, &one_a, Ap, Bp, &one_b, C, D);
	kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 triangular-solve kernel, unit-diagonal variant:
// compute C - A*B^T, then solve against the transposed unit-lower-triangular
// E (no diagonal scaling), storing the km x kn visible part into D.
void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
	{

	const int bs = 4;

	// 4x4 accumulator, indexed c[row][col]
	double c[4][4] = {{0.0}};

	double scal;

	int i, j, jj, k;

	// c[i][j] -= sum_k A[i+bs*k] * B[j+bs*k]   (i.e. - A * B^T)
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				c[i][j] -= A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// add C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[i][j] = C[i+bs*j] + c[i][j];
			}
		}

	// unit-diagonal forward substitution: column j accumulates the
	// strictly lower entries of E against columns 0..j-1; column 0 needs
	// no work, and columns at and beyond kn are left untouched
	for(j=1; j<4; j++)
		{
		if(kn==j)
			break;
		for(jj=0; jj<j; jj++)
			{
			scal = E[j+bs*jj];
			for(i=0; i<4; i++)
				{
				c[i][j] -= c[i][jj] * scal;
				}
			}
		}

	// store the km x kn part; the clamps reproduce the original branch
	// dispatch (km<1 still stores row 0, kn outside 1..3 stores all 4
	// columns)
	int mmax = km<1 ? 1 : (km<4 ? km : 4);
	int nmax = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<nmax; j++)
		{
		for(i=0; i<mmax; i++)
			{
			D[i+bs*j] = c[i][j];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
	{
	// Fixed-size variant: full 4x4 block (km = kn = 4).
	kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 triangular-solve kernel, upper variant: compute
// C - A*B^T, then solve against the transposed upper-triangular E using the
// precomputed reciprocal diagonal inv_diag_E, proceeding from column 3 down
// to column 0, and store the km x kn visible part into D.
//
// Fix: the original body carried a `store:` label that no goto ever targeted
// (unlike the sibling kernels), triggering an unused-label warning; the
// label has been removed. Arithmetic order is unchanged.
void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// 4x4 accumulator, indexed c[row][col]
	double c[4][4] = {{0.0}};

	double scal;

	int i, j, jj, k;

	// c[i][j] -= sum_k A[i+bs*k] * B[j+bs*k]   (i.e. - A * B^T)
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				c[i][j] -= A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// add C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[i][j] = C[i+bs*j] + c[i][j];
			}
		}

	// backward substitution from column 3 down to column 1: each column
	// within the kn range is scaled by its reciprocal diagonal and its
	// contribution removed from all earlier columns
	for(j=3; j>0; j--)
		{
		if(kn>j)
			{
			scal = inv_diag_E[j];
			for(i=0; i<4; i++)
				{
				c[i][j] *= scal;
				}
			for(jj=j-1; jj>=0; jj--)
				{
				scal = E[jj+bs*j];
				for(i=0; i<4; i++)
					{
					c[i][jj] -= c[i][j] * scal;
					}
				}
			}
		}
	// column 0 is always scaled
	scal = inv_diag_E[0];
	for(i=0; i<4; i++)
		{
		c[i][0] *= scal;
		}

	// store the km x kn part; the clamps reproduce the original branch
	// dispatch (km<1 still stores row 0, kn outside 1..3 stores all 4
	// columns)
	int mmax = km<1 ? 1 : (km<4 ? km : 4);
	int nmax = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<nmax; j++)
		{
		for(i=0; i<mmax; i++)
			{
			D[i+bs*j] = c[i][j];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
	{
	// Fixed-size variant: full 4x4 block (km = kn = 4).
	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 LU-factorization kernel (no pivoting): compute
// C - A*B, factorize the 4x4 block column by column, and store the km x kn
// visible part into D; inv_diag_D[j] receives the reciprocal of the j-th
// diagonal entry of U (no guard against a zero pivot, as in the caller's
// contract).
void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
	{

	const int bs = 4;

	// 4x4 accumulator, indexed c[row][col]
	double c[4][4] = {{0.0}};

	double rdiag;

	int i, j, jj, k, r;

	// c[i][j] -= sum_k A[i+bs*k] * B[k+bs*j]; B is stored in 4-row panels
	// separated by sdb, so rows advance inside a panel and jump by 4*sdb
	// across panels
	k = 0;
	for(; k<kmax-3; k+=4)
		{
		for(r=0; r<4; r++)
			{
			for(j=0; j<4; j++)
				{
				for(i=0; i<4; i++)
					{
					c[i][j] -= A[i+bs*r] * B[r+bs*j];
					}
				}
			}
		A += 4*bs;
		B += 4*sdb;
		}
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				c[i][j] -= A[i] * B[0+bs*j];
				}
			}
		A += bs;
		B += 1;
		}

	// add C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[i][j] += C[i+bs*j];
			}
		}

	// LU factorization without pivoting, one column at a time
	for(j=0; j<4; j++)
		{
		// eliminate with the already-factorized columns: rows below the
		// eliminating column's diagonal are updated with the U entry c[jj][j]
		for(jj=0; jj<j; jj++)
			{
			for(i=jj+1; i<4; i++)
				{
				c[i][j] -= c[i][jj] * c[jj][j];
				}
			}
		rdiag = 1.0 / c[j][j];
		for(i=j+1; i<4; i++)
			{
			c[i][j] *= rdiag;
			}
		inv_diag_D[j] = rdiag;
		// columns at and beyond kn are neither factorized nor stored
		if(kn==j+1)
			break;
		}

	// store the km x kn part; the clamps reproduce the original branch
	// dispatch (km<1 still stores row 0, kn outside 1..3 stores all 4
	// columns)
	int mmax = km<1 ? 1 : (km<4 ? km : 4);
	int nmax = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<nmax; j++)
		{
		for(i=0; i<mmax; i++)
			{
			D[i+bs*j] = c[i][j];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size LU-factorization kernel: forwards to the variable-size
// kernel with the full km=4, kn=4 block (no row/column masking).
void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
	{
	kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute the 4x4 block D = E^{-1} * (C - A * B), with E lower
// triangular with unit (implicit) diagonal, via forward substitution.
// Only the first km rows and kn columns of the result are stored.
//
// kmax   : length of the A*B inner product (<=0 skips the product)
// A      : 4 x kmax operand, panel-major with panel height bs=4
// B, sdb : kmax x 4 operand, panel-major with panel stride sdb (doubles)
// C, D, E: 4x4 column-major blocks, leading dimension bs=4
// km, kn : rows/columns of the result to compute and store (1..4)
//
// Changes vs. previous version: removed the unused local `tmp`;
// collapsed the hand-unrolled rank-1 loops and the km/kn goto ladders
// into equivalent loops (per-element floating-point accumulation order
// is unchanged, so results are bit-identical).
void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
	{

	const int bs = 4;

	int i, j, k, ll, r, m, n;

	double e_ir;

	// running accumulator: c[j][i] is row i, column j of the 4x4 block
	double c[4][4] = {{0.0}};

	// c -= A * B
	if(kmax>0)
		{
		for(k=0; k<kmax-3; k+=4)
			{
			for(ll=0; ll<4; ll++)
				{
				for(j=0; j<4; j++)
					{
					for(i=0; i<4; i++)
						{
						c[j][i] -= A[i+bs*ll] * B[ll+bs*j];
						}
					}
				}
			A += 16;
			B += 4*sdb;
			}
		for(; k<kmax; k++)
			{
			for(j=0; j<4; j++)
				{
				for(i=0; i<4; i++)
					{
					c[j][i] -= A[i] * B[bs*j];
					}
				}
			A += 4;
			B += 1;
			}
		}

	// c += C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[j][i] += C[i+bs*j];
			}
		}

	// forward substitution with unit-lower-triangular E: eliminate
	// column r of E from rows r+1..3; stop once the rows that remain
	// would not be stored anyway (km limit)
	for(r=0; r<3; r++)
		{
		if(km==r+1)
			break;
		for(i=r+1; i<4; i++)
			{
			e_ir = E[i+bs*r];
			for(j=0; j<4; j++)
				{
				c[j][i] -= e_ir * c[j][r];
				}
			}
		}

	// store the km x kn top-left corner of the result
	// (km<1 still stores one row, kn outside 1..3 stores all four
	// columns — same behavior as the original ladder)
	m = km>=4 ? 4 : (km>=1 ? km : 1);
	n = (kn>=1 && kn<=3) ? kn : 4;
	for(j=0; j<n; j++)
		{
		for(i=0; i<m; i++)
			{
			D[i+bs*j] = c[j][i];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size variant: forwards to the variable-size triangular-solve
// kernel with the full km=4, kn=4 block (no masking).
void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
	{
	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute the 4x4 block D = (C - A * B) * E^{-1}, with E upper
// triangular; the inverted diagonal entries of E are supplied
// separately in inv_diag_E. Only the first km rows and kn columns of
// the result are stored, and columns beyond kn are not solved.
//
// kmax       : length of the A*B inner product (<=0 skips the product)
// A          : 4 x kmax operand, panel-major with panel height bs=4
// B, sdb     : kmax x 4 operand, panel-major with panel stride sdb
// C, D, E    : 4x4 column-major blocks, leading dimension bs=4
// inv_diag_E : the 4 inverted diagonal entries of E
// km, kn     : rows/columns of the result to store (1..4)
//
// Changes vs. previous version: removed the unused local `tmp`;
// collapsed the hand-unrolled rank-1 loops and the kn goto ladder into
// equivalent loops (per-element accumulation order unchanged).
void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int i, j, k, ll, r, m, n;

	double e_rj, d_j;

	// running accumulator: c[j][i] is row i, column j of the 4x4 block
	double c[4][4] = {{0.0}};

	// c -= A * B
	if(kmax>0)
		{
		for(k=0; k<kmax-3; k+=4)
			{
			for(ll=0; ll<4; ll++)
				{
				for(j=0; j<4; j++)
					{
					for(i=0; i<4; i++)
						{
						c[j][i] -= A[i+bs*ll] * B[ll+bs*j];
						}
					}
				}
			A += 16;
			B += 4*sdb;
			}
		for(; k<kmax; k++)
			{
			for(j=0; j<4; j++)
				{
				for(i=0; i<4; i++)
					{
					c[j][i] -= A[i] * B[bs*j];
					}
				}
			A += 4;
			B += 1;
			}
		}

	// c += C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[j][i] += C[i+bs*j];
			}
		}

	// right triangular solve: columns left to right; subtract the
	// already-solved columns, then scale by the inverted pivot
	for(j=0; j<4; j++)
		{
		for(r=0; r<j; r++)
			{
			e_rj = E[r+bs*j];
			for(i=0; i<4; i++)
				{
				c[j][i] -= c[r][i] * e_rj;
				}
			}
		d_j = inv_diag_E[j];
		for(i=0; i<4; i++)
			{
			c[j][i] *= d_j;
			}
		if(kn==j+1)
			break;
		}

	// store the km x kn top-left corner of the result
	m = km>=4 ? 4 : (km>=1 ? km : 1);
	n = (kn>=1 && kn<=3) ? kn : 4;
	for(j=0; j<n; j++)
		{
		for(i=0; i<m; i++)
			{
			D[i+bs*j] = c[j][i];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size variant: forwards to the variable-size right-upper
// triangular-solve kernel with the full km=4, kn=4 block.
void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
	{
	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Compute the 4x4 block D = E^{-1} * (C - A * B), with E upper
// triangular, by back-substitution from the bottom row up; the
// inverted diagonal of E is supplied in inv_diag_E. Pivot rows at
// index >= km are skipped during substitution (their accumulators are
// not meaningful), and only the first km rows and kn columns of the
// result are stored.
//
// kmax       : length of the A*B inner product (<=0 skips the product)
// A          : 4 x kmax operand, panel-major with panel height bs=4
// B, sdb     : kmax x 4 operand, panel-major with panel stride sdb
// C, D, E    : 4x4 column-major blocks, leading dimension bs=4
// inv_diag_E : the 4 inverted diagonal entries of E
// km, kn     : rows/columns of the result to store (1..4)
//
// Changes vs. previous version: removed the unused local `tmp`, the
// never-targeted `store:` label, and a commented-out printf debug
// block; collapsed the hand-unrolled loops into equivalent loops
// (per-element accumulation order unchanged).
void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int i, j, k, ll, r, m, n;

	double e_ir, d_r;

	// running accumulator: c[j][i] is row i, column j of the 4x4 block
	double c[4][4] = {{0.0}};

	// c -= A * B
	if(kmax>0)
		{
		for(k=0; k<kmax-3; k+=4)
			{
			for(ll=0; ll<4; ll++)
				{
				for(j=0; j<4; j++)
					{
					for(i=0; i<4; i++)
						{
						c[j][i] -= A[i+bs*ll] * B[ll+bs*j];
						}
					}
				}
			A += 16;
			B += 4*sdb;
			}
		for(; k<kmax; k++)
			{
			for(j=0; j<4; j++)
				{
				for(i=0; i<4; i++)
					{
					c[j][i] -= A[i] * B[bs*j];
					}
				}
			A += 4;
			B += 1;
			}
		}

	// c += C
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			c[j][i] += C[i+bs*j];
			}
		}

	// back-substitution: pivot rows 3 down to 1, each scaled by its
	// inverted pivot, then eliminated from the rows above; a pivot row
	// r is processed only if it exists (km > r)
	for(r=3; r>0; r--)
		{
		if(km>r)
			{
			d_r = inv_diag_E[r];
			for(j=0; j<4; j++)
				{
				c[j][r] *= d_r;
				}
			for(i=0; i<r; i++)
				{
				e_ir = E[i+bs*r];
				for(j=0; j<4; j++)
					{
					c[j][i] -= e_ir * c[j][r];
					}
				}
			}
		}
	// first row: just scale by the inverted pivot (done unconditionally,
	// as in the original code)
	d_r = inv_diag_E[0];
	for(j=0; j<4; j++)
		{
		c[j][0] *= d_r;
		}

	// store the km x kn top-left corner of the result
	m = km>=4 ? 4 : (km>=1 ? km : 1);
	n = (kn>=1 && kn<=3) ? kn : 4;
	for(j=0; j<n; j++)
		{
		for(i=0; i<m; i++)
			{
			D[i+bs*j] = c[j][i];
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size variant: forwards to the variable-size left-upper
// triangular-solve kernel with the full km=4, kn=4 block.
void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
	{
	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
diff --git a/kernel/c99/kernel_dgemm_diag_lib4.c b/kernel/c99/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..cad2b21
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,1111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = alpha * A * diag(B[0:4]) for a kmax-row, 4-column strip stored
// in panel-major 4-row panels; beta is implicitly zero, so no C
// operand is read.
void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
	{

	const int bs = 4;

	int i, j, k;

	double b[4];

	if(kmax<=0)
		return;

	// pre-scale the diagonal by alpha
	for(j=0; j<4; j++)
		{
		b[j] = alpha[0] * B[j];
		}

	// full 4-row panels
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				D[i+bs*j] = A[i+bs*j] * b[j];
				}
			}
		A += 4*sda;
		D += 4*sdd;
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			D[bs*j] = A[bs*j] * b[j];
			}
		A += 1;
		D += 1;
		}

	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = beta*C + alpha * A * diag(B[0:4]) for a kmax-row, 4-column strip
// stored in panel-major 4-row panels.
void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
	{

	const int bs = 4;

	int i, j, k;

	double b[4], bet;

	if(kmax<=0)
		return;

	bet = beta[0];
	// pre-scale the diagonal by alpha
	for(j=0; j<4; j++)
		{
		b[j] = alpha[0] * B[j];
		}

	// full 4-row panels
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				D[i+bs*j] = bet * C[i+bs*j] + A[i+bs*j] * b[j];
				}
			}
		A += 4*sda;
		C += 4*sdc;
		D += 4*sdd;
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			D[bs*j] = bet * C[bs*j] + A[bs*j] * b[j];
			}
		A += 1;
		C += 1;
		D += 1;
		}

	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = beta*C + alpha * A * diag(B[0:3]) over the first 3 columns of a
// kmax-row strip stored in panel-major 4-row panels.
void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
	{

	const int bs = 4;

	int i, j, k;

	double b[3], bet;

	if(kmax<=0)
		return;

	bet = beta[0];
	// pre-scale the diagonal by alpha
	for(j=0; j<3; j++)
		{
		b[j] = alpha[0] * B[j];
		}

	// full 4-row panels
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<3; j++)
			{
			for(i=0; i<4; i++)
				{
				D[i+bs*j] = bet * C[i+bs*j] + A[i+bs*j] * b[j];
				}
			}
		A += 4*sda;
		C += 4*sdc;
		D += 4*sdd;
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<3; j++)
			{
			D[bs*j] = bet * C[bs*j] + A[bs*j] * b[j];
			}
		A += 1;
		C += 1;
		D += 1;
		}

	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = beta*C + alpha * A * diag(B[0:2]) over the first 2 columns of a
// kmax-row strip stored in panel-major 4-row panels.
void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
	{

	const int bs = 4;

	int i, j, k;

	double b[2], bet;

	if(kmax<=0)
		return;

	bet = beta[0];
	// pre-scale the diagonal by alpha
	for(j=0; j<2; j++)
		{
		b[j] = alpha[0] * B[j];
		}

	// full 4-row panels
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<2; j++)
			{
			for(i=0; i<4; i++)
				{
				D[i+bs*j] = bet * C[i+bs*j] + A[i+bs*j] * b[j];
				}
			}
		A += 4*sda;
		C += 4*sdc;
		D += 4*sdd;
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<2; j++)
			{
			D[bs*j] = bet * C[bs*j] + A[bs*j] * b[j];
			}
		A += 1;
		C += 1;
		D += 1;
		}

	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = beta*C + alpha * A * diag(B[0:1]) over the first column of a
// kmax-row strip stored in panel-major 4-row panels.
void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
	{

	const int bs = 4;

	int i, k;

	double b0, bet;

	if(kmax<=0)
		return;

	bet = beta[0];
	// pre-scale the single diagonal entry by alpha
	b0 = alpha[0] * B[0];

	// full 4-row panels
	for(k=0; k<kmax-3; k+=4)
		{
		for(i=0; i<4; i++)
			{
			D[i] = bet * C[i] + A[i] * b0;
			}
		A += 4*sda;
		C += 4*sdc;
		D += 4*sdd;
		}
	// leftover rows, one at a time
	for(; k<kmax; k++)
		{
		D[0] = bet * C[0] + A[0] * b0;
		A += 1;
		C += 1;
		D += 1;
		}

	}
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = alpha * diag(A[0:4]) * B for a 4-row, kmax-column panel-major
// strip; beta is implicitly zero. The trailing 'alg' argument is not
// used by this kernel; it is kept only for signature compatibility
// with existing callers.
void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D, int alg)
	{

	const int bs = 4;

	int i, j, k;

	double a[4];

	if(kmax<=0)
		return;

	// pre-scale the diagonal by alpha
	for(i=0; i<4; i++)
		{
		a[i] = alpha[0] * A[i];
		}

	// four columns at a time
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				D[i+bs*j] = a[i] * B[i+bs*j];
				}
			}
		B += 16;
		D += 16;
		}
	// leftover columns, one at a time
	for(; k<kmax; k++)
		{
		for(i=0; i<4; i++)
			{
			D[i] = a[i] * B[i];
			}
		B += 4;
		D += 4;
		}

	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha * diag(A) * B for a 4-row block; B/C/D are panel-major
+// (lib4), kmax is the number of columns processed.
+// NOTE(review): the trailing `alg` parameter is never read in this body —
+// presumably kept for signature compatibility; confirm against callers.
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // fold alpha into the 4 diagonal elements once, outside the loop
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ // main loop: 4 columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha * diag(A) * B for a 3-row block (top 3 rows of a lib4
+// panel); kmax is the number of columns processed.
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // fold alpha into the 3 diagonal elements once, outside the loop
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ // main loop: 4 columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha * diag(A) * B for a 2-row block (top 2 rows of a lib4
+// panel); kmax is the number of columns processed.
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // fold alpha into the 2 diagonal elements once, outside the loop
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ // main loop: 4 columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha * diag(A) * B for a 1-row block (top row of a lib4
+// panel); kmax is the number of columns processed.
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // fold alpha into the single diagonal element once, outside the loop
+ a_0 = alpha0 * A[0];
+
+ // main loop: 4 columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
diff --git a/kernel/c99/kernel_dgemv_4_lib4.c b/kernel/c99/kernel_dgemv_4_lib4.c
new file mode 100644
index 0000000..9f11b5f
--- /dev/null
+++ b/kernel/c99/kernel_dgemv_4_lib4.c
@@ -0,0 +1,1009 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// z = alpha * A * x + beta * y for one 4-row panel of A (not-transposed
+// gemv); A is panel-major (lib4), kmax is the number of columns.
+// Only result entries with index in [k0, k1) are stored to z, so callers can
+// mask out rows at the edges of the matrix.
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // main loop: accumulate 4 columns per iteration
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ // bitwise & on the 0/1 results of the comparisons: deliberate branch-
+ // reduced masking, equivalent to && for these operands
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size wrapper: store only the first k1 entries of the result.
+void kernel_dgemv_n_4_vs_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-size wrapper: store all 4 entries of the result.
+void kernel_dgemv_n_4_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// z = alpha * A^T * x + beta * y for a 4-column slice of panel-major A
+// (transposed gemv); kmax is the number of rows, sda the panel stride,
+// offA the row offset of A inside its first panel. Only the first km
+// entries of the result are stored.
+void kernel_dgemv_t_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k, kend;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ if(offA!=0) // 1, 2, 3
+ {
+ // align to the next panel boundary, one row at a time
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1);
+ }
+ // main loop: one full 4-row panel per iteration
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ // cleanup loop: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ // masked store of the first km (1..4) result entries
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-size wrapper: no row offset, store all 4 result entries.
+void kernel_dgemv_t_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size wrapper: no row offset, store only the first k1 entries.
+void kernel_dgemv_t_4_vs_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower, not-transposed triangular solve with pre-inverted diagonal
+// (inv_diag_A): first subtracts A[:, 0..kmax) * x from y (the gemv part over
+// the kmax leading columns), then solves the trailing 4x4 lower triangle
+// in-register, storing results to z. km/kn mask rows/columns at the matrix
+// edge (variable-size variant).
+void kernel_dtrsv_ln_inv_4_vs_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // gemv part: y_i -= A(:,k)*x(k) over full 4-column groups
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // add the rhs
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ double
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // forward substitution on the 4x4 triangle; the diagonal is multiplied
+ // by its precomputed reciprocal (inv_diag_A), never divided
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ // NOTE(review): the partially-updated tail is stored into y, not z,
+ // in these early-return paths; this is only equivalent to storing
+ // into z when callers pass y==z aliased — confirm against callers
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22 (registers a_00/a_10 reused for the lower-right corner)
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-size wrapper: solve all 4 rows/columns.
+void kernel_dtrsv_ln_inv_4_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ kernel_dtrsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower, transposed triangular solve (4 unknowns) with pre-inverted
+// diagonal: first subtracts the contribution of rows 4..kmax of A (gemv with
+// A^T), then back-substitutes through the leading 4x4 triangle, storing the
+// solution to z.
+void kernel_dtrsv_lt_inv_4_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // keep the start of A and x to rewind for the triangular part
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // skip the 4x4 triangle, start the gemv part at row 4
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ // rewind to the triangle
+ A = tA;
+ x = tx;
+
+ // bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower, transposed triangular solve (3 unknowns) with pre-inverted
+// diagonal: subtracts the contribution of rows 3..kmax of A (gemv with A^T)
+// from y, then back-substitutes through the leading 3x3 triangle, storing
+// the solution to z.
+void kernel_dtrsv_lt_inv_3_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // keep the start of A and x to rewind for the triangular part
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+ // BUGFIX: was `x += 1`, inconsistent with the k=3 starting index and
+ // with the _1/_2 siblings (which advance x by 1 and 2 respectively);
+ // for kmax==4 the cleanup loop below would read x[1] instead of x[3]
+ x += 3;
+ }
+ // cleanup loop: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ // rewind to the triangle
+ A = tA;
+ x = tx;
+
+ // bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower, transposed triangular solve (2 unknowns) with pre-inverted
+// diagonal: subtracts the contribution of rows 2..kmax of A (gemv with A^T)
+// from y, then back-substitutes through the leading 2x2 triangle, storing
+// the solution to z.
+void kernel_dtrsv_lt_inv_2_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // keep the start of A and x to rewind for the triangular part
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ // cleanup loop: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ // rewind to the triangle
+ A = tA;
+ x = tx;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower, transposed triangular solve (1 unknown) with pre-inverted
+// diagonal: subtracts the contribution of rows 1..kmax of A (gemv with A^T)
+// from y, then scales by the inverted diagonal, storing the solution to z.
+void kernel_dtrsv_lt_inv_1_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // keep the start of A and x to rewind for the triangular part
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ // cleanup loop: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+
+ // rewind to the triangle
+ A = tA;
+ x = tx;
+
+ // top triangle
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// z = U * x for upper, not-transposed triangular matrix-vector multiply,
+// 4-row slice of panel-major A; kmax is the number of columns. The first
+// 4x4 block is the triangle itself: the commented-out lines are the
+// strictly-lower entries that are skipped by construction.
+void kernel_dtrmv_un_4_lib4(int kmax, double *A, double *x, double *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ // triangular 4x4 head: only the upper part contributes
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ // dense gemv part over the remaining columns, 4 at a time
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // cleanup loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// z = U^T * x for upper, transposed triangular matrix-vector multiply, 4
+// columns of panel-major A; kmax is the number of rows and km masks how many
+// result entries are stored (variable-size variant). The dense part runs up
+// to kmax-4 so the final (possibly partial) 4x4 triangle is handled by the
+// tail code below the loop, where the commented-out lines are the
+// strictly-lower entries skipped by construction.
+void kernel_dtrmv_ut_4_vs_lib4(int kmax, double *A, int sda, double *x, double *z, int km)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ // triangular 4x4 tail: only the upper part contributes
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+ // masked store of the first km (1..4) result entries
+ // (removed dead `store:` label here: nothing in this function jumps to
+ // it, and it only triggered -Wunused-label)
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-size wrapper: store all 4 entries of the result.
+void kernel_dtrmv_ut_4_lib4(int kmax, double *A, int sda, double *x, double *z)
+ {
+
+ kernel_dtrmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
diff --git a/kernel/c99/kernel_dgeqrf_4_lib4.c b/kernel/c99/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..071ec86
--- /dev/null
+++ b/kernel/c99/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2620 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// Unblocked Householder QR factorization (LAPACK dgeqr2-like) of an m x 4
+// sub-matrix stored panel-major (panel size ps=4, panel stride sdd).
+// On exit: the upper triangle of pD holds R, the strictly lower part holds
+// the Householder vectors v (unit diagonal implicit), and dD[0..3] holds
+// the scalar factors tau.
+// NOTE(review): assumes pD points at row 0 of a panel -- confirm callers.
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;	// accumulates ||pD[1:m,0]||^2 (sub-diagonal part)
+	ii = 1;
+	if(m>1)
+		{
+		tmp = pD[1+ps*0];
+		beta += tmp*tmp;
+		if(m>2)
+			{
+			tmp = pD[2+ps*0];
+			beta += tmp*tmp;
+			if(m>3)
+				{
+				tmp = pD[3+ps*0];
+				beta += tmp*tmp;
+				}
+			}
+		}
+	// remaining full panels; the cleanup loop below handles m%4 leftovers
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// column already zero below the diagonal: reflector is identity
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;	// sign chosen to avoid cancellation in alpha-beta
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[0+ps*0] = beta;
+		ii = 1;
+		if(m>1)
+			{
+			pD[1+ps*0] *= tmp;
+			if(m>2)
+				{
+				pD[2+ps*0] *= tmp;
+				if(m>3)
+					{
+					pD[3+ps*0] *= tmp;
+					}
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*0] *= tmp;
+			pD[1+ii*sdd+ps*0] *= tmp;
+			pD[2+ii*sdd+ps*0] *= tmp;
+			pD[3+ii*sdd+ps*0] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*0] *= tmp;
+			}
+		}
+	// gemv_t & ger: apply H0 = I - tau0*v0*v0^T to columns 1..3
+	w1 = pD[0+ps*1];
+	w2 = pD[0+ps*2];
+	w3 = pD[0+ps*3];
+	if(m>1)
+		{
+		w1 += pD[1+ps*1] * pD[1+ps*0];
+		w2 += pD[1+ps*2] * pD[1+ps*0];
+		w3 += pD[1+ps*3] * pD[1+ps*0];
+		if(m>2)
+			{
+			w1 += pD[2+ps*1] * pD[2+ps*0];
+			w2 += pD[2+ps*2] * pD[2+ps*0];
+			w3 += pD[2+ps*3] * pD[2+ps*0];
+			if(m>3)
+				{
+				w1 += pD[3+ps*1] * pD[3+ps*0];
+				w2 += pD[3+ps*2] * pD[3+ps*0];
+				w3 += pD[3+ps*3] * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[0+ps*1] += w1;
+	pD[0+ps*2] += w2;
+	pD[0+ps*3] += w3;
+	if(m>1)
+		{
+		pD[1+ps*1] += w1 * pD[1+ps*0];
+		pD[1+ps*2] += w2 * pD[1+ps*0];
+		pD[1+ps*3] += w3 * pD[1+ps*0];
+		if(m>2)
+			{
+			pD[2+ps*1] += w1 * pD[2+ps*0];
+			pD[2+ps*2] += w2 * pD[2+ps*0];
+			pD[2+ps*3] += w3 * pD[2+ps*0];
+			if(m>3)
+				{
+				pD[3+ps*1] += w1 * pD[3+ps*0];
+				pD[3+ps*2] += w2 * pD[3+ps*0];
+				pD[3+ps*3] += w3 * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+		}
+	if(m==1)
+		return;
+	// second column
+	beta = 0.0;
+	if(m>2)
+		{
+		tmp = pD[2+ps*1];
+		beta += tmp*tmp;
+		if(m>3)
+			{
+			tmp = pD[3+ps*1];
+			beta += tmp*tmp;
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[1+ps*1] = beta;
+		if(m>2)
+			{
+			pD[2+ps*1] *= tmp;
+			if(m>3)
+				{
+				pD[3+ps*1] *= tmp;
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*1] *= tmp;
+			pD[1+ii*sdd+ps*1] *= tmp;
+			pD[2+ii*sdd+ps*1] *= tmp;
+			pD[3+ii*sdd+ps*1] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*1] *= tmp;
+			}
+		}
+	// gemv_t & ger: apply H1 to columns 2..3
+	w2 = pD[1+ps*2];
+	w3 = pD[1+ps*3];
+	if(m>2)
+		{
+		w2 += pD[2+ps*2] * pD[2+ps*1];
+		w3 += pD[2+ps*3] * pD[2+ps*1];
+		if(m>3)
+			{
+			w2 += pD[3+ps*2] * pD[3+ps*1];
+			w3 += pD[3+ps*3] * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[1+ps*2] += w2;
+	pD[1+ps*3] += w3;
+	if(m>2)
+		{
+		pD[2+ps*2] += w2 * pD[2+ps*1];
+		pD[2+ps*3] += w3 * pD[2+ps*1];
+		if(m>3)
+			{
+			pD[3+ps*2] += w2 * pD[3+ps*1];
+			pD[3+ps*3] += w3 * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+		}
+	if(m==2)
+		return;
+	// third column
+	beta = 0.0;
+	if(m>3)
+		{
+		tmp = pD[3+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[2+ps*2] = beta;
+		if(m>3)
+			{
+			pD[3+ps*2] *= tmp;
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*2] *= tmp;
+			pD[1+ii*sdd+ps*2] *= tmp;
+			pD[2+ii*sdd+ps*2] *= tmp;
+			pD[3+ii*sdd+ps*2] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*2] *= tmp;
+			}
+		}
+	// gemv_t & ger: apply H2 to column 3
+	w3 = pD[2+ps*3];
+	if(m>3)
+		{
+		w3 += pD[3+ps*3] * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	w3 = - dD[2] * w3;
+	pD[2+ps*3] += w3;
+	if(m>3)
+		{
+		pD[3+ps*3] += w3 * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+		}
+	if(m==3)
+		return;
+	// fourth column: only tau and v remain to compute (no trailing columns)
+	beta = 0.0;
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*3] *= tmp;
+			pD[1+ii*sdd+ps*3] *= tmp;
+			pD[2+ii*sdd+ps*3] *= tmp;
+			pD[3+ii*sdd+ps*3] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*3] *= tmp;
+			}
+		}
+	return;
+	}
+
+
+// unblocked algorithm
+// Variable-size unblocked Householder QR (dgeqr2-like) of an m x n
+// panel-major sub-matrix that starts at row offset offD inside its panel
+// (panel size ps=4, panel stride sdd).  Only the first k columns are
+// factorized (imax = k); each reflector is immediately applied to the
+// trailing columns.  dD receives tau[0..k-1].
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)	// bitwise | on 0/1 comparison results, same as ||
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k; //m<n ? m : n;
+	double alpha, beta, tmp, w0;
+	double *pC00, *pC10, *pC01, *pC11;
+	int offset;
+	double *pD0 = pD-offD;	// rebase to the top of the containing panel
+	for(ii=0; ii<imax; ii++)
+		{
+		// pC00 = diagonal element (ii,ii); pC10 = first sub-diagonal
+		// element; both computed panel-major from the in-panel row offset
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		jmax = m-ii-1;	// number of sub-diagonal entries in column ii
+		// jmax0 = entries left before the next panel boundary
+		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		offset = 0;
+		jj = 0;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				tmp = pC10[0+offset];
+				beta += tmp*tmp;
+				offset += 1;
+				}
+			offset += -ps+ps*sdd;	// jump to the next panel
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			tmp = pC10[1+offset];
+			beta += tmp*tmp;
+			tmp = pC10[2+offset];
+			beta += tmp*tmp;
+			tmp = pC10[3+offset];
+			beta += tmp*tmp;
+			offset += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			offset += 1;
+			}
+		if(beta==0.0)
+			{
+			// column already zero below the diagonal: identity reflector
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;	// avoid cancellation in alpha-beta
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// scale the sub-diagonal part into the reflector vector v
+			offset = 0;
+			jj = 0;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					pC10[0+offset] *= tmp;
+					offset += 1;
+					}
+				offset += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				pC10[0+offset] *= tmp;
+				pC10[1+offset] *= tmp;
+				pC10[2+offset] *= tmp;
+				pC10[3+offset] *= tmp;
+				offset += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				pC10[0+offset] *= tmp;
+				offset += 1;
+				}
+			pC00[0] = beta;
+			}
+		if(ii<n)
+			{
+			// apply H_ii = I - tau*v*v^T to the trailing columns ii+1..n-1
+			pC01 = pC00 + ps;
+			pC11 = pC10 + ps;
+			kmax = jmax;
+			kmax0 = jmax0;
+			jmax = n-ii-1;
+			jj = 0;
+			for( ; jj<jmax; jj++)
+				{
+				// w0 = v^T * C(:,jj) (implicit unit head of v)
+				w0 = pC01[0+ps*jj] * 1.0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+						offset += 1;
+						}
+					offset += -ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					offset += 1;
+					}
+				w0 = - dD[ii] * w0;
+				// C(:,jj) += w0 * v (rank-1 update)
+				pC01[0+ps*jj] += w0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+						offset += 1;
+						}
+					offset = offset-ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					offset += 1;
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// Apply the block of 4 Householder reflectors held in pD/dD (as produced
+// by kernel_dgeqrf_4_lib4) to the m x n matrix pC0, two columns of C at a
+// time: builds a lower triangular factor T from the tau values, computes
+// W^T = C^T*V, multiplies by T and performs C -= V*W^T.
+// NOTE(review): assumes dD[0..3] are all valid, i.e. m>=4 was used for
+// the factorization -- confirm at call sites.
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)	// bitwise | on 0/1 comparison results, same as ||
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+	       v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};	// 4x4 lower triangular T factor
+	int ldt = 4;
+	double pW[8];// = {};	// 2x4 workspace W^T (two columns of C at a time)
+	int ldw = 2;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	// main loop: two columns of C at a time
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V (V unit lower triangular, head rows explicit)
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		// full panels below the first; skipped entirely when m<=4
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T (in place, highest column first since T is lower triangular)
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	// remaining single column of C when n is odd
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[0+ldw*1] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ tmp = pC[1+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ tmp = pC[2+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ tmp = pC[3+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ b1 = pW[3+ldw*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[0+jj*sdc+ps*1] = c01;
+ pC[1+jj*sdc+ps*1] = c11;
+ pC[2+jj*sdc+ps*1] = c21;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+
+ return;
+ }
+
+
+
// Unblocked LQ factorization of a 4-row, n-column panel stored in
// panel-major format (panel size ps=4): A = L*Q with Q the product of 4
// Householder reflectors H(0)..H(3).
// On exit, pD holds L in its lower-left 4x4 triangle and the scaled
// reflector vectors v in the trailing part of each row; dD[0..3] receive
// the scalar factors tau (LAPACK dgelqf convention).
// assume n>=4
void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
	{
	int ii, jj, ll;
	double alpha, beta, tmp, w1, w2, w3;
	const int ps = 4; // panel size: element (ii,jj) is at pD[ii+ps*jj]
	// first column
	// squared norm of the trailing part of row 0
	beta = 0.0;
	for(ii=1; ii<n; ii++)
		{
		tmp = pD[0+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// row already triangular: reflector is the identity
		// tau
		dD[0] = 0.0;
		}
	else
		{
		alpha = pD[0+ps*0];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta; // sign opposite to alpha avoids cancellation in alpha-beta
		// tau0
		dD[0] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[0+ps*0] = beta;
		for(ii=1; ii<n; ii++)
			{
			pD[0+ps*ii] *= tmp;
			}
		}
	// apply H(0) to rows 1..3 from the right:
	// w = A(1:3,:)*v0 (gemv_t), then A(1:3,:) += w*v0^T scaled by -tau0 (ger);
	// v0 has an implicit 1.0 in column 0
	// gemv_t & ger
	w1 = pD[1+ps*0];
	w2 = pD[2+ps*0];
	w3 = pD[3+ps*0];
	w1 += pD[1+ps*1] * pD[0+ps*1];
	w2 += pD[2+ps*1] * pD[0+ps*1];
	w3 += pD[3+ps*1] * pD[0+ps*1];
	w1 += pD[1+ps*2] * pD[0+ps*2];
	w2 += pD[2+ps*2] * pD[0+ps*2];
	w3 += pD[3+ps*2] * pD[0+ps*2];
	w1 += pD[1+ps*3] * pD[0+ps*3];
	w2 += pD[2+ps*3] * pD[0+ps*3];
	w3 += pD[3+ps*3] * pD[0+ps*3];
	for(ii=4; ii<n; ii++)
		{
		w1 += pD[1+ps*ii] * pD[0+ps*ii];
		w2 += pD[2+ps*ii] * pD[0+ps*ii];
		w3 += pD[3+ps*ii] * pD[0+ps*ii];
		}
	w1 = - dD[0] * w1;
	w2 = - dD[0] * w2;
	w3 = - dD[0] * w3;
	pD[1+ps*0] += w1;
	pD[2+ps*0] += w2;
	pD[3+ps*0] += w3;
	pD[1+ps*1] += w1 * pD[0+ps*1];
	pD[2+ps*1] += w2 * pD[0+ps*1];
	pD[3+ps*1] += w3 * pD[0+ps*1];
	pD[1+ps*2] += w1 * pD[0+ps*2];
	pD[2+ps*2] += w2 * pD[0+ps*2];
	pD[3+ps*2] += w3 * pD[0+ps*2];
	pD[1+ps*3] += w1 * pD[0+ps*3];
	pD[2+ps*3] += w2 * pD[0+ps*3];
	pD[3+ps*3] += w3 * pD[0+ps*3];
	for(ii=4; ii<n; ii++)
		{
		pD[1+ps*ii] += w1 * pD[0+ps*ii];
		pD[2+ps*ii] += w2 * pD[0+ps*ii];
		pD[3+ps*ii] += w3 * pD[0+ps*ii];
		}
	// second column: same pattern on row 1, trailing columns 2..n-1
	beta = 0.0;
	for(ii=2; ii<n; ii++)
		{
		tmp = pD[1+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[1] = 0.0;
		}
	else
		{
		alpha = pD[1+ps*1];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[1] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[1+ps*1] = beta;
		for(ii=2; ii<n; ii++)
			{
			pD[1+ps*ii] *= tmp;
			}
		}
	// apply H(1) to rows 2..3
	// gemv_t & ger
	w2 = pD[2+ps*1];
	w3 = pD[3+ps*1];
	w2 += pD[2+ps*2] * pD[1+ps*2];
	w3 += pD[3+ps*2] * pD[1+ps*2];
	w2 += pD[2+ps*3] * pD[1+ps*3];
	w3 += pD[3+ps*3] * pD[1+ps*3];
	for(ii=4; ii<n; ii++)
		{
		w2 += pD[2+ps*ii] * pD[1+ps*ii];
		w3 += pD[3+ps*ii] * pD[1+ps*ii];
		}
	w2 = - dD[1] * w2;
	w3 = - dD[1] * w3;
	pD[2+ps*1] += w2;
	pD[3+ps*1] += w3;
	pD[2+ps*2] += w2 * pD[1+ps*2];
	pD[3+ps*2] += w3 * pD[1+ps*2];
	pD[2+ps*3] += w2 * pD[1+ps*3];
	pD[3+ps*3] += w3 * pD[1+ps*3];
	for(ii=4; ii<n; ii++)
		{
		pD[2+ps*ii] += w2 * pD[1+ps*ii];
		pD[3+ps*ii] += w3 * pD[1+ps*ii];
		}
	// third column
	beta = 0.0;
	for(ii=3; ii<n; ii++)
		{
		tmp = pD[2+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[2] = 0.0;
		}
	else
		{
		alpha = pD[2+ps*2];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[2] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[2+ps*2] = beta;
		for(ii=3; ii<n; ii++)
			{
			pD[2+ps*ii] *= tmp;
			}
		}
	// apply H(2) to row 3
	// gemv_t & ger
	w3 = pD[3+ps*2];
	w3 += pD[3+ps*3] * pD[2+ps*3];
	for(ii=4; ii<n; ii++)
		{
		w3 += pD[3+ps*ii] * pD[2+ps*ii];
		}
	w3 = - dD[2] * w3;
	pD[3+ps*2] += w3;
	pD[3+ps*3] += w3 * pD[2+ps*3];
	for(ii=4; ii<n; ii++)
		{
		pD[3+ps*ii] += w3 * pD[2+ps*ii];
		}
	// fourth column: only the reflector is built, no rows left to update
	beta = 0.0;
	for(ii=4; ii<n; ii++)
		{
		tmp = pD[3+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[3] = 0.0;
		}
	else
		{
		alpha = pD[3+ps*3];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[3] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[3+ps*3] = beta;
		for(ii=4; ii<n; ii++)
			{
			pD[3+ps*ii] *= tmp;
			}
		}
	return;
	}
+
+
+
// Variable-size unblocked LQ factorization: compute k Householder
// reflectors of the m x n matrix stored panel-major in pD (panel size
// ps=4, panel stride sdd), starting offD rows into the first panel.
// dD receives the k scalar factors tau (LAPACK dgelqf convention).
// The main loop factorizes two rows per iteration and applies the pair
// of reflectors jointly through a 2x2 triangular factor T; a scalar
// remainder loop handles the last row when k is odd.
// unblocked algorithm
void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
	{
	if(m<=0 | n<=0) // bitwise | on 0/1 comparison results: branchless ||
		return;
	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
	const int ps = 4;
	imax = k;//m<n ? m : n;
	double alpha, beta, tmp;
	double w00, w01,
		   w10, w11,
		   w20, w21,
		   w30, w31;
	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
	double pT[4];
	int ldt = 2;
	double *pD0 = pD-offD; // rebase so that row indices include offD
	ii = 0;
#if 1
	for(; ii<imax-1; ii+=2)
		{
		// first row
		// address of element (offD+ii, ii): row-within-panel
		// + panel-base*sdd + column*ps
		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
		beta = 0.0;
		for(jj=1; jj<n-ii; jj++)
			{
			tmp = pC00[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			dD[ii] = 0.0;
			}
		else
			{
			alpha = pC00[0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta; // sign choice avoids cancellation in alpha-beta
			dD[ii] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC00[0] = beta;
			for(jj=1; jj<n-ii; jj++)
				pC00[0+ps*jj] *= tmp;
			}
		// apply H(ii) to the next row before factorizing it
		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
		kmax = n-ii;
		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
		for(kk=1; kk<kmax; kk++)
			{
			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
			}
		w00 = - w00*dD[ii];
		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
		for(kk=1; kk<kmax; kk++)
			{
			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
			}
		// second row
		pC11 = pC10+ps*1;
		beta = 0.0;
		for(jj=1; jj<n-(ii+1); jj++)
			{
			tmp = pC11[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			dD[(ii+1)] = 0.0;
			}
		else
			{
			alpha = pC11[0+ps*0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta;
			dD[(ii+1)] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC11[0+ps*0] = beta;
			for(jj=1; jj<n-(ii+1); jj++)
				pC11[0+ps*jj] *= tmp;
			}
		// compute T
		// 2x2 upper-triangular factor of the reflector pair;
		// tmp = <v0,v1> with the implicit unit entries written out
		kmax = n-ii;
		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
		for(kk=2; kk<kmax; kk++)
			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
		pT[0+ldt*0] = dD[ii+0];
		pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
		pT[1+ldt*1] = dD[ii+1];
		// downgrade
		// apply both reflectors to the remaining m-ii-2 rows;
		// jmax0 = rows left before the next panel boundary
		kmax = n-ii;
		jmax = m-ii-2;
		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
		jmax0 = jmax<jmax0 ? jmax : jmax0;
		jj = 0;
		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
		pC20 = pC20a;
		if(jmax0>0)
			{
			// scalar clean-up until the panel boundary
			for( ; jj<jmax0; jj++)
				{
				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
				for(kk=2; kk<kmax; kk++)
					{
					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
					}
				w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
				w00 = - w00*pT[0+ldt*0];
				pC20[0+ps*0] += w00*1.0 + w01*0.0;
				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
				for(kk=2; kk<kmax; kk++)
					{
					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
					}
				pC20 += 1;
				}
			pC20 += -ps+ps*sdd; // step from panel bottom to the next panel top
			}
		// full panels: update 4 rows at a time
		for( ; jj<jmax-3; jj+=4)
			{
			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
			w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
			w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
			w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
			w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
			w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
			w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
				w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
				w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
				w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
				w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
				w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
				w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
				}
			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
			w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
			w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
			w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
			w00 = - w00*pT[0+ldt*0];
			w10 = - w10*pT[0+ldt*0];
			w20 = - w20*pT[0+ldt*0];
			w30 = - w30*pT[0+ldt*0];
			pC20[0+ps*0] += w00*1.0 + w01*0.0;
			pC20[1+ps*0] += w10*1.0 + w11*0.0;
			pC20[2+ps*0] += w20*1.0 + w21*0.0;
			pC20[3+ps*0] += w30*1.0 + w31*0.0;
			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
			pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
			pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
			pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
				pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
				pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
				pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
				}
			pC20 += ps*sdd;
			}
		// scalar clean-up of the remaining rows (fewer than 4)
		for(ll=0; ll<jmax-jj; ll++)
			{
			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
				}
			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
			w00 = - w00*pT[0+ldt*0];
			pC20[0+ps*0] += w00*1.0 + w01*0.0;
			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
				}
			pC20 += 1;
			}
		}
#endif
	// remainder: factorize one row per iteration (also used when the
	// two-row path above is disabled)
	for(; ii<imax; ii++)
		{
		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
		beta = 0.0;
		for(jj=1; jj<n-ii; jj++)
			{
			tmp = pC00[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			dD[ii] = 0.0;
			}
		else
			{
			alpha = pC00[0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta;
			dD[ii] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC00[0] = beta;
			for(jj=1; jj<n-ii; jj++)
				pC00[0+ps*jj] *= tmp;
			}
		if(ii<n)
			{
			// apply the single reflector to the rows below
			kmax = n-ii;
			jmax = m-ii-1;
			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
			jmax0 = jmax<jmax0 ? jmax : jmax0;
			jj = 0;
			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
			pC10 = pC10a;
			if(jmax0>0)
				{
				for( ; jj<jmax0; jj++)
					{
					w00 = pC10[0+ps*0];
					for(kk=1; kk<kmax; kk++)
						{
						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
						}
					w00 = - w00*dD[ii];
					pC10[0+ps*0] += w00;
					for(kk=1; kk<kmax; kk++)
						{
						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
						}
					pC10 += 1;
					}
				pC10 += -ps+ps*sdd;
				}
			for( ; jj<jmax-3; jj+=4)
				{
				w00 = pC10[0+ps*0];
				w10 = pC10[1+ps*0];
				w20 = pC10[2+ps*0];
				w30 = pC10[3+ps*0];
				for(kk=1; kk<kmax; kk++)
					{
					w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
					w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
					w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
					w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
					}
				w00 = - w00*dD[ii];
				w10 = - w10*dD[ii];
				w20 = - w20*dD[ii];
				w30 = - w30*dD[ii];
				pC10[0+ps*0] += w00;
				pC10[1+ps*0] += w10;
				pC10[2+ps*0] += w20;
				pC10[3+ps*0] += w30;
				for(kk=1; kk<kmax; kk++)
					{
					pC10[0+ps*kk] += w00*pC00[0+ps*kk];
					pC10[1+ps*kk] += w10*pC00[0+ps*kk];
					pC10[2+ps*kk] += w20*pC00[0+ps*kk];
					pC10[3+ps*kk] += w30*pC00[0+ps*kk];
					}
				pC10 += ps*sdd;
				}
			for(ll=0; ll<jmax-jj; ll++)
				{
				w00 = pC10[0+ps*0];
				for(kk=1; kk<kmax; kk++)
					{
					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
					}
				w00 = - w00*dD[ii];
				pC10[0+ps*0] += w00;
				for(kk=1; kk<kmax; kk++)
					{
					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
					}
				pC10 += 1;
				}
			}
		}
	return;
	}
+
+
+
// Build the 4x4 upper-triangular block-reflector factor T (LAPACK dlarft,
// forward/rowwise convention) from 4 Householder vectors stored as the
// rows of the panel-major (ps=4) matrix pD and their scalar factors dD.
// Only the upper triangle of pT is written.
// assume kmax>=4
void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
	{
	const int ps = 4;
	int kk;
	// pairwise dot products v_ij = <v_i, v_j> (i<j) between reflector
	// vectors; each v_i carries an implicit 1.0 on its diagonal entry,
	// which contributes the initial pD[i+ps*j] term below
	double v10 = pD[0+ps*1];
	double v20 = pD[0+ps*2];
	double v21 = pD[1+ps*2];
	double v30 = pD[0+ps*3];
	double v31 = pD[1+ps*3];
	double v32 = pD[2+ps*3];
	for(kk=2; kk<kmax; kk++)
		v10 += pD[1+ps*kk]*pD[0+ps*kk];
	for(kk=3; kk<kmax; kk++)
		{
		v20 += pD[2+ps*kk]*pD[0+ps*kk];
		v21 += pD[2+ps*kk]*pD[1+ps*kk];
		}
	for(kk=4; kk<kmax; kk++)
		{
		v30 += pD[3+ps*kk]*pD[0+ps*kk];
		v31 += pD[3+ps*kk]*pD[1+ps*kk];
		v32 += pD[3+ps*kk]*pD[2+ps*kk];
		}
	// diagonal of T is -tau; off-diagonal columns follow the dlarft
	// recurrence T(0:j-1,j) = -tau_j * T(0:j-1,0:j-1) * V^T v_j
	pT[0+ps*0] = - dD[0];
	pT[1+ps*1] = - dD[1];
	pT[2+ps*2] = - dD[2];
	pT[3+ps*3] = - dD[3];
	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
	return;
	}
+
+
+
// Fused LQ factorization of a 4 x n panel (as kernel_dgelqf_4_lib4) that
// also builds the 4x4 block-reflector factor T (as kernel_dlarft_4_lib4)
// on the fly. pD is panel-major (ps=4); dD receives the 4 taus; pT is
// zeroed and then filled in its upper triangle.
// The squared norm (beta) needed by each column is accumulated for free
// during the trailing update of the previous column.
// NOTE(review): when a column has zero trailing norm, the code jumps
// ahead with beta==0.0 still in scope, which makes every subsequent tau
// zero as well (the later betas are never recomputed) — confirm this
// cascade is intended; it matches the upstream structure.
// assume n>=4
void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
	{
	int ii, jj, ll;
	double alpha, beta, tmp, w0, w1, w2, w3;
	const int ps = 4;
	// zero tau matrix
	for(ii=0; ii<16; ii++)
		pT[ii] = 0.0;
	// first column
	beta = 0.0;
	for(ii=1; ii<n; ii++)
		{
		tmp = pD[0+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		dD[0] = 0.0;
		tmp = 0.0;
		goto col2;
		}
	alpha = pD[0+ps*0];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta; // sign opposite to alpha avoids cancellation
	dD[0] = (beta-alpha) / beta;
	pT[0+ps*0] = - dD[0];
	tmp = 1.0 / (alpha-beta);
	// scale v0 and simultaneously start w = A(1:3,:)*v0 for the update
	pD[0+ps*0] = beta;
	w1 = pD[1+ps*0];
	w2 = pD[2+ps*0];
	w3 = pD[3+ps*0];
	//
	pD[0+ps*1] *= tmp;
	w1 += pD[1+ps*1] * pD[0+ps*1];
	w2 += pD[2+ps*1] * pD[0+ps*1];
	w3 += pD[3+ps*1] * pD[0+ps*1];
	//
	pD[0+ps*2] *= tmp;
	w1 += pD[1+ps*2] * pD[0+ps*2];
	w2 += pD[2+ps*2] * pD[0+ps*2];
	w3 += pD[3+ps*2] * pD[0+ps*2];
	//
	pD[0+ps*3] *= tmp;
	w1 += pD[1+ps*3] * pD[0+ps*3];
	w2 += pD[2+ps*3] * pD[0+ps*3];
	w3 += pD[3+ps*3] * pD[0+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[0+ps*ii] *= tmp;
		w1 += pD[1+ps*ii] * pD[0+ps*ii];
		w2 += pD[2+ps*ii] * pD[0+ps*ii];
		w3 += pD[3+ps*ii] * pD[0+ps*ii];
		}
	// rank-1 update of rows 1..3: A += (-tau0 * w) * v0^T;
	// beta for the next column is accumulated during the update
	w1 = - dD[0] * w1;
	w2 = - dD[0] * w2;
	w3 = - dD[0] * w3;
	//
	pD[1+ps*0] += w1;
	pD[2+ps*0] += w2;
	pD[3+ps*0] += w3;
	//
	pD[1+ps*1] += w1 * pD[0+ps*1];
	pD[2+ps*1] += w2 * pD[0+ps*1];
	pD[3+ps*1] += w3 * pD[0+ps*1];
	//
	pD[1+ps*2] += w1 * pD[0+ps*2];
	pD[2+ps*2] += w2 * pD[0+ps*2];
	pD[3+ps*2] += w3 * pD[0+ps*2];
	beta = pD[1+ps*2] * pD[1+ps*2]; // start norm of row 1 trailing part
	//
	pD[1+ps*3] += w1 * pD[0+ps*3];
	pD[2+ps*3] += w2 * pD[0+ps*3];
	pD[3+ps*3] += w3 * pD[0+ps*3];
	beta += pD[1+ps*3] * pD[1+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[1+ps*ii] += w1 * pD[0+ps*ii];
		pD[2+ps*ii] += w2 * pD[0+ps*ii];
		pD[3+ps*ii] += w3 * pD[0+ps*ii];
		beta += pD[1+ps*ii] * pD[1+ps*ii];
		}
	// second column
col2:
	if(beta==0.0)
		{
		dD[1] = 0.0;
		tmp = 0.0;
		goto col3;
		}
	alpha = pD[1+ps*1];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[1] = (beta-alpha) / beta;
	pT[1+ps*1] = - dD[1];
	tmp = 1.0 / (alpha-beta);
	// scale v1; w0 accumulates <v0,v1> for the T recurrence,
	// w2/w3 the gemv for the trailing-row update
	pD[1+ps*1] = beta;
	w0 = pD[0+ps*1]; //
	w2 = pD[2+ps*1];
	w3 = pD[3+ps*1];
	//
	pD[1+ps*2] *= tmp;
	w0 += pD[0+ps*2] * pD[1+ps*2]; //
	w2 += pD[2+ps*2] * pD[1+ps*2];
	w3 += pD[3+ps*2] * pD[1+ps*2];
	//
	pD[1+ps*3] *= tmp;
	w0 += pD[0+ps*3] * pD[1+ps*3]; //
	w2 += pD[2+ps*3] * pD[1+ps*3];
	w3 += pD[3+ps*3] * pD[1+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[1+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
		w2 += pD[2+ps*ii] * pD[1+ps*ii];
		w3 += pD[3+ps*ii] * pD[1+ps*ii];
		}
	// column 1 of T, then rank-1 update of rows 2..3
	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
	w2 = - dD[1] * w2;
	w3 = - dD[1] * w3;
	//
	pD[2+ps*1] += w2;
	pD[3+ps*1] += w3;
	//
	pD[2+ps*2] += w2 * pD[1+ps*2];
	pD[3+ps*2] += w3 * pD[1+ps*2];
	//
	pD[2+ps*3] += w2 * pD[1+ps*3];
	pD[3+ps*3] += w3 * pD[1+ps*3];
	beta = pD[2+ps*3] * pD[2+ps*3]; // start norm of row 2 trailing part
	//
	for(ii=4; ii<n; ii++)
		{
		pD[2+ps*ii] += w2 * pD[1+ps*ii];
		pD[3+ps*ii] += w3 * pD[1+ps*ii];
		beta += pD[2+ps*ii] * pD[2+ps*ii];
		}
	// third column
col3:
	if(beta==0.0)
		{
		dD[2] = 0.0;
		tmp = 0.0;
		goto col4;
		}
	alpha = pD[2+ps*2];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[2] = (beta-alpha) / beta;
	pT[2+ps*2] = - dD[2];
	tmp = 1.0 / (alpha-beta);
	// scale v2; w0/w1 accumulate <v0,v2>, <v1,v2>, w3 the gemv
	pD[2+ps*2] = beta;
	w0 = pD[0+ps*2];
	w1 = pD[1+ps*2];
	w3 = pD[3+ps*2];
	//
	pD[2+ps*3] *= tmp;
	w0 += pD[0+ps*3] * pD[2+ps*3];
	w1 += pD[1+ps*3] * pD[2+ps*3];
	w3 += pD[3+ps*3] * pD[2+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[2+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[2+ps*ii];
		w1 += pD[1+ps*ii] * pD[2+ps*ii];
		w3 += pD[3+ps*ii] * pD[2+ps*ii];
		}
	// column 2 of T, then rank-1 update of row 3
	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
	w3 = - dD[2] * w3;
	//
	pD[3+ps*2] += w3;
	//
	pD[3+ps*3] += w3 * pD[2+ps*3];
	//
	beta = 0.0;
	for(ii=4; ii<n; ii++)
		{
		pD[3+ps*ii] += w3 * pD[2+ps*ii];
		beta += pD[3+ps*ii] * pD[3+ps*ii];
		}
	// fourth column
col4:
	if(beta==0.0)
		{
		dD[3] = 0.0;
		tmp = 0.0;
		return;
		}
	alpha = pD[3+ps*3];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[3] = (beta-alpha) / beta;
	pT[3+ps*3] = - dD[3];
	tmp = 1.0 / (alpha-beta);
	// scale v3 and gather the dot products for the last column of T
	pD[3+ps*3] = beta;
	w0 = pD[0+ps*3];
	w1 = pD[1+ps*3];
	w2 = pD[2+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[3+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[3+ps*ii];
		w1 += pD[1+ps*ii] * pD[3+ps*ii];
		w2 += pD[2+ps*ii] * pD[3+ps*ii];
		}
	//
	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
	return;
	}
+
+
+
// Apply a block of 4 Householder reflectors (vectors in pV, triangular
// factor in pT, panel-major with ps=4) to 4 rows of pD from the right:
// W = D*V^T, W = W*T, D = D + W*V. The reflector matrix V is unit
// "lower" triangular in this storage: column jj has an implicit 1.0 at
// row jj and zeros above it, so those terms are handled separately.
// assume kmax>=4
void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD)
	{
	const int ps = 4;
	double pW[16];
	double acc;
	int ii, jj, kk;
	// W = D * V^T, exploiting the implicit unit diagonal of V
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			pW[ii+ps*jj] = pD[ii+ps*jj]; // implicit 1.0 at V[jj][jj]
			for(kk=jj+1; kk<kmax; kk++)
				pW[ii+ps*jj] += pD[ii+ps*kk]*pV[jj+ps*kk];
			}
		}
	// W = W * T (T upper triangular); walk the columns backward so each
	// column only reads columns that have not been overwritten yet
	for(jj=3; jj>=0; jj--)
		{
		for(ii=0; ii<4; ii++)
			{
			acc = pW[ii+ps*0]*pT[0+ps*jj];
			for(kk=1; kk<=jj; kk++)
				acc += pW[ii+ps*kk]*pT[kk+ps*jj];
			pW[ii+ps*jj] = acc;
			}
		}
	// D = D + W * V; again the unit diagonal of V contributes pW itself
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			if(jj==0)
				{
				acc = pW[ii+ps*0];
				}
			else
				{
				acc = pW[ii+ps*0]*pV[0+ps*jj];
				for(kk=1; kk<jj; kk++)
					acc += pW[ii+ps*kk]*pV[kk+ps*jj];
				acc += pW[ii+ps*jj]; // implicit 1.0 at V[jj][jj]
				}
			pD[ii+ps*jj] += acc;
			}
		}
	for(kk=4; kk<kmax; kk++)
		{
		for(ii=0; ii<4; ii++)
			{
			acc = pW[ii+ps*0]*pV[0+ps*kk];
			acc += pW[ii+ps*1]*pV[1+ps*kk];
			acc += pW[ii+ps*2]*pV[2+ps*kk];
			acc += pW[ii+ps*3]*pV[3+ps*kk];
			pD[ii+ps*kk] += acc;
			}
		}
	return;
	}
+
+
+
// Single-row variant of kernel_dlarfb4_r_4_lib4: apply a block of 4
// Householder reflectors (vectors in pV, factor in pT) to one row of pD
// from the right: w = d*V^T, w = w*T, d = d + w*V. V has an implicit
// 1.0 at position jj of column jj and zeros above it.
// assume kmax>=4
void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
	{
	const int ps = 4;
	double w[4];
	double acc;
	int jj, kk;
	// w = d * V^T, exploiting the implicit unit diagonal of V
	for(jj=0; jj<4; jj++)
		{
		w[jj] = pD[0+ps*jj]; // implicit 1.0 at V[jj][jj]
		for(kk=jj+1; kk<kmax; kk++)
			w[jj] += pD[0+ps*kk]*pV[jj+ps*kk];
		}
	// w = w * T (upper triangular), last entry first so that each entry
	// only reads entries that are still un-updated
	for(jj=3; jj>=0; jj--)
		{
		acc = w[0]*pT[0+ps*jj];
		for(kk=1; kk<=jj; kk++)
			acc += w[kk]*pT[kk+ps*jj];
		w[jj] = acc;
		}
	// d = d + w * V
	pD[0+ps*0] += w[0];
	for(jj=1; jj<4; jj++)
		{
		acc = w[0]*pV[0+ps*jj];
		for(kk=1; kk<jj; kk++)
			acc += w[kk]*pV[kk+ps*jj];
		acc += w[jj]; // implicit 1.0 at V[jj][jj]
		pD[0+ps*jj] += acc;
		}
	for(kk=4; kk<kmax; kk++)
		{
		pD[0+ps*kk] += w[0]*pV[0+ps*kk] + w[1]*pV[1+ps*kk] + w[2]*pV[2+ps*kk] + w[3]*pV[3+ps*kk];
		}
	return;
	}
diff --git a/kernel/c99/kernel_dgetrf_pivot_4_lib4.c b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..787322e
--- /dev/null
+++ b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,779 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering, starting from 0
+// C numbering, starting from 0
+// find the index (p_idamax) and absolute value (p_amax) of the element of
+// largest magnitude in a column vector of length n stored in panel-major
+// format (panel size bs=4, panel stride sda); offset is the row position of
+// the first element within its panel; sets p_idamax[0] = -1 when n<1
+void didamax_lib4(int n, int offset, double *pA, int sda, int *p_idamax, double *p_amax)
+	{
+
+	int idamax, ii;
+	double tmp, amax;
+
+	p_idamax[0] = -1;
+	if(n<1)
+		return;
+
+	const int bs = 4;
+
+	// number of elements in the (possibly partial) leading panel
+	int na = (bs - offset%bs)%bs;
+	na = n<na ? n : na;
+
+	// amax starts negative, so the first element always wins the comparison
+	// and idamax is guaranteed to be set for n>=1
+	amax = -1.0;
+	ii = 0;
+	if(na>0)
+		{
+		// clean up the partial leading panel one element at a time
+		for( ; ii<na; ii++)
+			{
+			tmp = fabs(pA[0]);
+			if(tmp>amax)
+				{
+				idamax = ii+0;
+				amax = tmp;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1); // jump to the next panel
+		}
+	// main loop: one full panel (4 elements) per iteration
+	for( ; ii<n-3; ii+=4)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		tmp = fabs(pA[1]);
+		if(tmp>amax)
+			{
+			idamax = ii+1;
+			amax = tmp;
+			}
+		tmp = fabs(pA[2]);
+		if(tmp>amax)
+			{
+			idamax = ii+2;
+			amax = tmp;
+			}
+		tmp = fabs(pA[3]);
+		if(tmp>amax)
+			{
+			idamax = ii+3;
+			amax = tmp;
+			}
+		pA += bs*sda;
+		}
+	// clean up the trailing partial panel
+	for( ; ii<n; ii++)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		pA += 1;
+		}
+
+	p_amax[0] = amax;
+	p_idamax[0] = idamax;
+
+	return;
+
+	}
+
+
+
+// C numering (starting from zero) in the ipiv
+// it process m>=4 rows and 4 cols
+// C numering (starting from zero) in the ipiv
+// it process m>=4 rows and 4 cols
+// LU factorization with row pivoting of an m x 4 panel-major column block
+// (panel size bs=4, panel stride sda); for each of the 4 columns it
+//   1) applies the updates from the previously factorized columns,
+//   2) finds the pivot row with didamax_lib4 and swaps it via drowsw_lib,
+//   3) scales the sub-diagonal entries by the pivot reciprocal.
+// The reciprocals of the 4 diagonal entries are stored in inv_diag_A;
+// a (numerically) zero pivot yields inv_diag_A[j]=0.0 and no scaling.
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;  // rows below the top 4x4 block
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		u_11, u_12, u_13,
+		u_22, u_23,
+		u_33;
+
+	double
+		*pB;
+
+	int
+		k, idamax;
+
+	// first column
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			// panel-major row swap: row r lives at panel r/bs, offset r%bs
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		// scale sub-diagonal of the top panel
+		pA[1+bs*0] *= tmp0;
+		pA[2+bs*0] *= tmp0;
+		pA[3+bs*0] *= tmp0;
+		pB = pA + bs*sda;
+		// scale the remaining panels, 4 rows at a time
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB[1+bs*0] *= tmp0;
+			pB[2+bs*0] *= tmp0;
+			pB[3+bs*0] *= tmp0;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB += 1;
+			}
+		}
+	else
+		{
+		// singular pivot: flag with a zero reciprocal
+		inv_diag_A[0] = 0.0;
+		}
+
+	// second column
+	// correct column 1 with the multipliers of column 0
+	u_01 = pA[0+bs*1];
+	tmp1 = pA[1+bs*1];
+	tmp2 = pA[2+bs*1];
+	tmp3 = pA[3+bs*1];
+	tmp1 -= pA[1+bs*0] * u_01;
+	tmp2 -= pA[2+bs*0] * u_01;
+	tmp3 -= pA[3+bs*0] * u_01;
+	pA[1+bs*1] = tmp1;
+	pA[2+bs*1] = tmp2;
+	pA[3+bs*1] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp1 = pB[1+bs*1];
+		tmp2 = pB[2+bs*1];
+		tmp3 = pB[3+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		tmp1 -= pB[1+bs*0] * u_01;
+		tmp2 -= pB[2+bs*0] * u_01;
+		tmp3 -= pB[3+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB[1+bs*1] = tmp1;
+		pB[2+bs*1] = tmp2;
+		pB[3+bs*1] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB += 1;
+		}
+
+	// pivot & scale column 1 (search starts at row 1)
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		pA[2+bs*1] *= tmp1;
+		pA[3+bs*1] *= tmp1;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB[1+bs*1] *= tmp1;
+			pB[2+bs*1] *= tmp1;
+			pB[3+bs*1] *= tmp1;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	// third column
+	// correct column 2 with the multipliers of columns 0 and 1
+	u_02 = pA[0+bs*2];
+	u_12 = pA[1+bs*2];
+	u_12 -= pA[1+bs*0] * u_02;
+	pA[1+bs*2] = u_12;
+	tmp2 = pA[2+bs*2];
+	tmp3 = pA[3+bs*2];
+	tmp2 -= pA[2+bs*0] * u_02;
+	tmp3 -= pA[3+bs*0] * u_02;
+	tmp2 -= pA[2+bs*1] * u_12;
+	tmp3 -= pA[3+bs*1] * u_12;
+	pA[2+bs*2] = tmp2;
+	pA[3+bs*2] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0 = pB[0+bs*2];
+		tmp1 = pB[1+bs*2];
+		tmp2 = pB[2+bs*2];
+		tmp3 = pB[3+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp1 -= pB[1+bs*0] * u_02;
+		tmp2 -= pB[2+bs*0] * u_02;
+		tmp3 -= pB[3+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		tmp1 -= pB[1+bs*1] * u_12;
+		tmp2 -= pB[2+bs*1] * u_12;
+		tmp3 -= pB[3+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB[1+bs*2] = tmp1;
+		pB[2+bs*2] = tmp2;
+		pB[3+bs*2] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB += 1;
+		}
+
+	// pivot & scale column 2 (search starts at row 2)
+	didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+	ipiv[2] = idamax+2;
+	if(tmp2!=0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		tmp2 = 1.0 / pA[2+bs*2];
+		inv_diag_A[2] = tmp2;
+		pA[3+bs*2] *= tmp2;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB[1+bs*2] *= tmp2;
+			pB[2+bs*2] *= tmp2;
+			pB[3+bs*2] *= tmp2;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[2] = 0.0;
+		}
+
+	// fourth column
+	// correct column 3 with the multipliers of columns 0, 1 and 2
+	u_03 = pA[0+bs*3];
+	u_13 = pA[1+bs*3];
+	u_13 -= pA[1+bs*0] * u_03;
+	pA[1+bs*3] = u_13;
+	u_23 = pA[2+bs*3];
+	u_23 -= pA[2+bs*0] * u_03;
+	u_23 -= pA[2+bs*1] * u_13;
+	pA[2+bs*3] = u_23;
+	tmp3 = pA[3+bs*3];
+	tmp3 -= pA[3+bs*0] * u_03;
+	tmp3 -= pA[3+bs*1] * u_13;
+	tmp3 -= pA[3+bs*2] * u_23;
+	pA[3+bs*3] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0 = pB[0+bs*3];
+		tmp1 = pB[1+bs*3];
+		tmp2 = pB[2+bs*3];
+		tmp3 = pB[3+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp1 -= pB[1+bs*0] * u_03;
+		tmp2 -= pB[2+bs*0] * u_03;
+		tmp3 -= pB[3+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp1 -= pB[1+bs*1] * u_13;
+		tmp2 -= pB[2+bs*1] * u_13;
+		tmp3 -= pB[3+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		tmp1 -= pB[1+bs*2] * u_23;
+		tmp2 -= pB[2+bs*2] * u_23;
+		tmp3 -= pB[3+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB[1+bs*3] = tmp1;
+		pB[2+bs*3] = tmp2;
+		pB[3+bs*3] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB += 1;
+		}
+
+	// pivot & scale column 3 (search starts at row 3)
+	didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+	ipiv[3] = idamax+3;
+	if(tmp3!=0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		tmp3 = 1.0 / pA[3+bs*3];
+		inv_diag_A[3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB[1+bs*3] *= tmp3;
+			pB[2+bs*3] *= tmp3;
+			pB[3+bs*3] *= tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[3] = 0.0;
+		}
+
+	return;
+
+	}
+
+
+
+// it process m>0 rows and 0<n<=4 cols
+// it process m>0 rows and 0<n<=4 cols
+// variable-size variant of kernel_dgetrf_pivot_4_lib4: LU factorization with
+// row pivoting of an m x n (n<=4) panel-major column block; same
+// correct / pivot / swap / scale scheme per column, with extra branches for
+// m<4 (partial top panel) and early returns once the n active columns are done
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;  // rows below the top 4x4 block (negative if m<4)
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		u_11, u_12, u_13,
+		u_22, u_23,
+		u_33;
+
+	double
+		*pB;
+
+	int
+		k, idamax;
+
+	// first column
+
+	// find pivot & scale
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			// panel-major row swap: row r lives at panel r/bs, offset r%bs
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		if(m>=4)
+			{
+			pA[1+bs*0] *= tmp0;
+			pA[2+bs*0] *= tmp0;
+			pA[3+bs*0] *= tmp0;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB[1+bs*0] *= tmp0;
+				pB[2+bs*0] *= tmp0;
+				pB[3+bs*0] *= tmp0;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB += 1;
+				}
+			}
+		else // m = {1,2,3}
+			{
+			if(m>1)
+				{
+				pA[1+bs*0] *= tmp0;
+				if(m>2)
+					pA[2+bs*0] *= tmp0;
+				}
+			}
+		}
+	else
+		{
+		// singular pivot: flag with a zero reciprocal
+		inv_diag_A[0] = 0.0;
+		}
+
+	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+		return;
+
+	// second column
+
+	// correct
+	if(m>=4)
+		{
+		u_01 = pA[0+bs*1];
+		tmp1 = pA[1+bs*1];
+		tmp2 = pA[2+bs*1];
+		tmp3 = pA[3+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		tmp2 -= pA[2+bs*0] * u_01;
+		tmp3 -= pA[3+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		pA[2+bs*1] = tmp2;
+		pA[3+bs*1] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp1 = pB[1+bs*1];
+			tmp2 = pB[2+bs*1];
+			tmp3 = pB[3+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			tmp1 -= pB[1+bs*0] * u_01;
+			tmp2 -= pB[2+bs*0] * u_01;
+			tmp3 -= pB[3+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB[1+bs*1] = tmp1;
+			pB[2+bs*1] = tmp2;
+			pB[3+bs*1] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_01 = pA[0+bs*1];
+		tmp1 = pA[1+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		if(m>2)
+			{
+			tmp2 = pA[2+bs*1];
+			tmp2 -= pA[2+bs*0] * u_01;
+			pA[2+bs*1] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		if(m>=4)
+			{
+			pA[2+bs*1] *= tmp1;
+			pA[3+bs*1] *= tmp1;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB[1+bs*1] *= tmp1;
+				pB[2+bs*1] *= tmp1;
+				pB[3+bs*1] *= tmp1;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB += 1;
+				}
+			}
+		else // m = {2,3}
+			{
+			if(m>2)
+				pA[2+bs*1] *= tmp1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	if(n==2)
+		return;
+
+	// third column
+
+	// correct
+	if(m>=4)
+		{
+		u_02 = pA[0+bs*2];
+		u_12 = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		tmp2 = pA[2+bs*2];
+		tmp3 = pA[3+bs*2];
+		tmp2 -= pA[2+bs*0] * u_02;
+		tmp3 -= pA[3+bs*0] * u_02;
+		tmp2 -= pA[2+bs*1] * u_12;
+		tmp3 -= pA[3+bs*1] * u_12;
+		pA[2+bs*2] = tmp2;
+		pA[3+bs*2] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0 = pB[0+bs*2];
+			tmp1 = pB[1+bs*2];
+			tmp2 = pB[2+bs*2];
+			tmp3 = pB[3+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp1 -= pB[1+bs*0] * u_02;
+			tmp2 -= pB[2+bs*0] * u_02;
+			tmp3 -= pB[3+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			tmp1 -= pB[1+bs*1] * u_12;
+			tmp2 -= pB[2+bs*1] * u_12;
+			tmp3 -= pB[3+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB[1+bs*2] = tmp1;
+			pB[2+bs*2] = tmp2;
+			pB[3+bs*2] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_02 = pA[0+bs*2];
+		u_12 = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		if(m>2)
+			{
+			tmp2 = pA[2+bs*2];
+			tmp2 -= pA[2+bs*0] * u_02;
+			tmp2 -= pA[2+bs*1] * u_12;
+			pA[2+bs*2] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	// (only if row 2 exists; for m==2 there is no pivot to select)
+	if(m>2)
+		{
+		didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+		ipiv[2] = idamax+2;
+		if(tmp2!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			tmp2 = 1.0 / pA[2+bs*2];
+			inv_diag_A[2] = tmp2;
+			if(m>=4)
+				{
+				pA[3+bs*2] *= tmp2;
+				pB = pA + bs*sda;
+				for(k=0; k<ma-3; k+=4)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB[1+bs*2] *= tmp2;
+					pB[2+bs*2] *= tmp2;
+					pB[3+bs*2] *= tmp2;
+					pB += bs*sda;
+					}
+				for( ; k<ma; k++)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB += 1;
+					}
+				}
+			}
+		else
+			{
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n<4)
+		return;
+
+	// fourth column
+
+	// correct
+	if(m>=4)
+		{
+		u_03 = pA[0+bs*3];
+		u_13 = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		u_23 = pA[2+bs*3];
+		u_23 -= pA[2+bs*0] * u_03;
+		u_23 -= pA[2+bs*1] * u_13;
+		pA[2+bs*3] = u_23;
+		tmp3 = pA[3+bs*3];
+		tmp3 -= pA[3+bs*0] * u_03;
+		tmp3 -= pA[3+bs*1] * u_13;
+		tmp3 -= pA[3+bs*2] * u_23;
+		pA[3+bs*3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0 = pB[0+bs*3];
+			tmp1 = pB[1+bs*3];
+			tmp2 = pB[2+bs*3];
+			tmp3 = pB[3+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp1 -= pB[1+bs*0] * u_03;
+			tmp2 -= pB[2+bs*0] * u_03;
+			tmp3 -= pB[3+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp1 -= pB[1+bs*1] * u_13;
+			tmp2 -= pB[2+bs*1] * u_13;
+			tmp3 -= pB[3+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			tmp1 -= pB[1+bs*2] * u_23;
+			tmp2 -= pB[2+bs*2] * u_23;
+			tmp3 -= pB[3+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB[1+bs*3] = tmp1;
+			pB[2+bs*3] = tmp2;
+			pB[3+bs*3] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_03 = pA[0+bs*3];
+		u_13 = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		if(m>2)
+			{
+			u_23 = pA[2+bs*3];
+			u_23 -= pA[2+bs*0] * u_03;
+			u_23 -= pA[2+bs*1] * u_13;
+			pA[2+bs*3] = u_23;
+			}
+		}
+
+	// (only if row 3 exists)
+	if(m>3)
+		{
+		// find pivot & scale
+		didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+		ipiv[3] = idamax+3;
+		if(tmp3!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			tmp3 = 1.0 / pA[3+bs*3];
+			inv_diag_A[3] = tmp3;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB[1+bs*3] *= tmp3;
+				pB[2+bs*3] *= tmp3;
+				pB[3+bs*3] *= tmp3;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB += 1;
+				}
+			}
+		else
+			{
+			inv_diag_A[3] = 0.0;
+			}
+		}
+
+	return;
+
+	}
+
+
+
+
+
diff --git a/kernel/c99/kernel_dsymv_4_lib4.c b/kernel/c99/kernel_dsymv_4_lib4.c
new file mode 100644
index 0000000..bed4300
--- /dev/null
+++ b/kernel/c99/kernel_dsymv_4_lib4.c
@@ -0,0 +1,1024 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// fused gemv over a kmax x 4 panel-major block A: in one pass it updates
+//   z_n += A * (alpha_n*x_n)            (non-transposed product, in place)
+//   z_t  = alpha_t * A^T * x_t + beta_t * y_t   (transposed product)
+// km (1..4) limits how many of the 4 columns are active; z_n must already
+// contain the scaled y_n (see XXX note above)
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+	// pre-scale x_n by alpha_n; inactive columns (>=km) keep a zero factor
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha_n[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha_n[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha_n[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha_n[0]*x_n[3];
+				}
+			}
+		}
+
+	// accumulators for the transposed product A^T * x_t
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	// main loop: one full panel (4 rows) per iteration
+	k = 0;
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1];
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2];
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3];
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	// clean-up loop: remaining rows one at a time
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+
+	// store t: z_t = alpha_t * (A^T x_t) + beta_t * y_t, only the km active entries
+	z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+	if(km>1)
+		{
+		z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+		if(km>2)
+			{
+			z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+			if(km>3)
+				{
+				z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+// full-width (km=4) convenience wrapper around kernel_dgemv_nt_4_vs_lib4
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
+	{
+
+	kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// symmetric (lower triangle) matrix-vector product over a kmax x 4
+// panel-major block: z_n += alpha * A * x_n using only the lower triangle,
+// realized as a fused n/t gemv with x_t = x_n and z_t = z_n; offA (0..3) is
+// the row offset of the first element within its panel and selects the
+// prologue that walks single rows until panel alignment is reached; km (1..4)
+// limits the active columns; z_n must already contain the scaled y_n (see
+// XXX note above)
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
+	{
+
+	if(kmax<=0)
+		return;
+
+	// symmetric case: the transposed-product operands alias the normal ones
+	double *x_t = x_n;
+	double *z_t = z_n;
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+	// pre-scale x_n by alpha; inactive columns (>=km) keep a zero factor
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha[0]*x_n[3];
+				}
+			}
+		}
+
+	// accumulators for the transposed (upper-mirror) contribution
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	// prologue: process the first rows one by one, using only the lower
+	// triangle of the diagonal 4x4 block, until A is panel-aligned
+	k = 0;
+	if(offA==0)
+		{
+		if(kmax<4)
+			{
+			// fewer rows than the diagonal block: handle up to 3 rows and store
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+
+			y_t_0 += a_00 * x_t_0;
+
+			if(kmax==1)
+				goto store_t;
+
+			// 1
+
+			y_n_0 = z_n[1];
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+			if(kmax==2)
+				goto store_t;
+
+			// 2
+
+			y_n_0 = z_n[2];
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+			goto store_t;
+			}
+		else
+			{
+			// full diagonal 4x4 block (lower triangle only)
+
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+
+			y_t_0 += a_00 * x_t_0;
+
+
+			// 1
+
+			y_n_0 = z_n[1];
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+
+			// 2
+
+			y_n_0 = z_n[2];
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+
+			// 3
+
+			y_n_0 = z_n[3];
+			x_t_0 = x_t[3];
+
+			a_00 = A[3+bs*0];
+			a_01 = A[3+bs*1];
+			a_02 = A[3+bs*2];
+			a_03 = A[3+bs*3];
+
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_n_0 += a_02 * x_n_2;
+			y_t_2 += a_02 * x_t_0;
+			y_t_3 += a_03 * x_t_0;
+
+			z_n[3] = y_n_0;
+
+			k += 4;
+			A += sda*bs;
+			z_n += 4;
+			x_t += 4;
+
+			}
+		}
+	else if(offA==1)
+		{
+		// 3 rows left in the first panel, then one full panel: 7 rows total
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==6)
+			goto store_t;
+
+		// 6
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==7)
+			goto store_t;
+
+		k += 7;
+
+		}
+	else if(offA==2)
+		{
+		// 2 rows left in the first panel, then one full panel: 6 rows total
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==6)
+			goto store_t;
+
+		k += 6;
+
+		}
+	else // if(offA==3)
+		{
+		// 1 row left in the first panel, then one full panel: 5 rows total
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==5)
+			goto store_t;
+
+		k += 5;
+
+		}
+	// main loop over the strictly-lower part: one full panel per iteration
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1];
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2];
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3];
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	// clean-up loop: remaining rows one at a time
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0];
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+
+	// add the scaled transposed contribution to the km active entries
+	store_t:
+	z_t[0] += alpha[0]*y_t_0;
+	if(km>1)
+		{
+		z_t[1] += alpha[0]*y_t_1;
+		if(km>2)
+			{
+			z_t[2] += alpha[0]*y_t_2;
+			if(km>3)
+				{
+				z_t[3] += alpha[0]*y_t_3;
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+// panel-aligned (offA=0), full-width (km=4) convenience wrapper around
+// kernel_dsymv_l_4_gen_lib4
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
+	{
+
+	kernel_dsymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+
diff --git a/kernel/c99/kernel_sgecp_lib4.c b/kernel/c99/kernel_sgecp_lib4.c
new file mode 100644
index 0000000..de5b704
--- /dev/null
+++ b/kernel/c99/kernel_sgecp_lib4.c
@@ -0,0 +1,1148 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+
// A is aligned to a 256-bit boundary; scale a full 4-row panel in place
void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	// the kmax 4-row columns are contiguous: one flat loop over 4*kmax
	int ii;
	for(ii=0; ii<4*kmax; ii++)
		{
		A[ii] *= alpha;
		}

	}
+
+
+
// scale the top 3 rows of each 4-row panel column in place; row 3 untouched
void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	float *end = A + 4*kmax;
	for(; A!=end; A+=4)
		{
		A[0] *= alpha;
		A[1] *= alpha;
		A[2] *= alpha;
		}

	}
+
+
+
// scale the top 2 rows of each 4-row panel column in place; rows 2-3 untouched
void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	float *end = A + 4*kmax;
	for(; A!=end; A+=4)
		{
		A[0] *= alpha;
		A[1] *= alpha;
		}

	}
+
+
+
// scale only row 0 of each 4-row panel column in place
void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	float *end = A + 4*kmax;
	for(; A!=end; A+=4)
		{
		A[0] *= alpha;
		}

	}
+
+
+
// both A and B are aligned to 256-bit boundaries; aligned full-panel copy
void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
	{

	if(kmax<=0)
		return;

	// full 4-row columns are contiguous: flat copy of 4*kmax elements
	int ii;
	for(ii=0; ii<4*kmax; ii++)
		{
		B[ii] = A[ii];
		}

	}
+
+
+
// copy 4 rows per column where the source starts 1 row down in the panel
// pair (A0 = current panel, A1 = next panel); B stays aligned
void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[1];
		B[1] = A0[2];
		B[2] = A0[3];
		B[3] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// copy 4 rows per column where the source starts 2 rows down in the panel
// pair (A0 = current panel, A1 = next panel); B stays aligned
void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = A1[0];
		B[3] = A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// copy 4 rows per column where the source starts 3 rows down in the panel
// pair (A0 = current panel, A1 = next panel); B stays aligned
void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		B[2] = A1[1];
		B[3] = A1[2];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned copy of the top 3 rows of each 4-row panel column
void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
	{

	if(kmax<=0)
		return;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];
		A += 4;
		B += 4;
		}

	}
+
+
+
// copy 3 rows per column where the source starts 2 rows down in the panel
// pair (A0 = current panel, A1 = next panel)
void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// copy 3 rows per column where the source starts 3 rows down in the panel
// pair (A0 = current panel, A1 = next panel)
void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		B[2] = A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned copy of the top 2 rows of each 4-row panel column
void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
	{

	if(kmax<=0)
		return;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		B[1] = A[1];
		A += 4;
		B += 4;
		}

	}
+
+
+
// copy 2 rows per column where the source starts 3 rows down in the panel
// pair (A0 = current panel, A1 = next panel)
void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned copy of row 0 only of each 4-row panel column
void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
	{

	if(kmax<=0)
		return;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		A += 4;
		B += 4;
		}

	}
+
+
+
// aligned lower-triangular copy: kmax+1 full 4-row columns, then the
// strictly-lower 3x3 tail of the next 3 columns
void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];
		B[3] = A[3];
		A += bs;
		B += bs;
		}

	// 3x3 strictly-lower tail
	B[1] = A[1];
	B[2] = A[2];
	B[3] = A[3];

	B[2+bs] = A[2+bs];
	B[3+bs] = A[3+bs];

	B[3+2*bs] = A[3+2*bs];

	}
+
+
+
// lower-triangular copy, source shifted 1 row down across the panel pair:
// kmax+1 full 4-row columns, then the strictly-lower 3x3 tail
void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[1];
		B[1] = A0[2];
		B[2] = A0[3];
		B[3] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 3x3 strictly-lower tail
	B[1] = A0[2];
	B[2] = A0[3];
	B[3] = A1[0];

	B[2+bs] = A0[3+bs];
	B[3+bs] = A1[0+bs];

	B[3+2*bs] = A1[0+2*bs];

	}
+
+
+
// lower-triangular copy, source shifted 2 rows down across the panel pair:
// kmax+1 full 4-row columns, then the strictly-lower 3x3 tail
void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = A1[0];
		B[3] = A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 3x3 strictly-lower tail
	B[1] = A0[3];
	B[2] = A1[0];
	B[3] = A1[1];

	B[2+bs] = A1[0+bs];
	B[3+bs] = A1[1+bs];

	B[3+2*bs] = A1[1+2*bs];

	}
+
+
+
// lower-triangular copy, source shifted 3 rows down across the panel pair:
// kmax+1 full 4-row columns, then the strictly-lower 3x3 tail
void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		B[2] = A1[1];
		B[3] = A1[2];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 3x3 strictly-lower tail
	B[1] = A1[0];
	B[2] = A1[1];
	B[3] = A1[2];

	B[2+bs] = A1[1+bs];
	B[3+bs] = A1[2+bs];

	B[3+2*bs] = A1[2+2*bs];

	}
+
+
+
// aligned lower-triangular copy, 3-row width: kmax+1 columns, then the
// strictly-lower 2x2 tail
void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];
		A += bs;
		B += bs;
		}

	// 2x2 strictly-lower tail
	B[1] = A[1];
	B[2] = A[2];

	B[2+bs] = A[2+bs];

	}
+
+
+
// lower-triangular copy, 3-row width, source shifted 2 rows down across the
// panel pair: kmax+1 columns, then the strictly-lower 2x2 tail
void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 2x2 strictly-lower tail
	B[1] = A0[3];
	B[2] = A1[0];

	B[2+bs] = A1[0+bs];

	}
+
+
+
// lower-triangular copy, 3-row width, source shifted 3 rows down across the
// panel pair: kmax+1 columns, then the strictly-lower 2x2 tail
void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		B[2] = A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 2x2 strictly-lower tail
	B[1] = A1[0];
	B[2] = A1[1];

	B[2+bs] = A1[1+bs];

	}
+
+
+
// aligned lower-triangular copy, 2-row width: kmax+1 columns, then a 1x1 tail.
// NOTE(review): `alpha` is unused and absent from all sibling strcp kernels'
// signatures — kept unchanged for interface compatibility; confirm against callers.
void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		B[1] = A[1];
		A += bs;
		B += bs;
		}

	// 1x1 tail
	B[1] = A[1];

	}
+
+
+
// lower-triangular copy, 2-row width, source shifted 3 rows down across the
// panel pair: kmax+1 columns, then a 1x1 tail
void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A0[3];
		B[1] = A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	// 1x1 tail
	B[1] = A1[0];

	}
+
+
+
// aligned lower-triangular copy, 1-row width: kmax+1 columns, no tail
void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
	{

	kmax += 1;

	if(kmax<=0)
		return;

	const int bs = 4;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] = A[0];
		A += bs;
		B += bs;
		}

	}
+
+
+
+
// aligned full-panel scaled add: B += alpha * A
void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	// full 4-row columns are contiguous: flat loop over 4*kmax elements
	int ii;
	for(ii=0; ii<4*kmax; ii++)
		{
		B[ii] += alpha * A[ii];
		}

	}
+
+
+
// scaled add B += alpha*A, source shifted 1 row down across the panel pair
void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[1];
		B[1] += alpha * A0[2];
		B[2] += alpha * A0[3];
		B[3] += alpha * A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// scaled add B += alpha*A, source shifted 2 rows down across the panel pair
void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[2];
		B[1] += alpha * A0[3];
		B[2] += alpha * A1[0];
		B[3] += alpha * A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// scaled add B += alpha*A, source shifted 3 rows down across the panel pair
void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[3];
		B[1] += alpha * A1[0];
		B[2] += alpha * A1[1];
		B[3] += alpha * A1[2];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned scaled add B += alpha*A on the top 3 rows of each panel column
void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A[0];
		B[1] += alpha * A[1];
		B[2] += alpha * A[2];
		A += 4;
		B += 4;
		}

	}
+
+
+
// scaled add B += alpha*A, 3-row width, source shifted 2 rows down across
// the panel pair
void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[2];
		B[1] += alpha * A0[3];
		B[2] += alpha * A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// scaled add B += alpha*A, 3-row width, source shifted 3 rows down across
// the panel pair
void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[3];
		B[1] += alpha * A1[0];
		B[2] += alpha * A1[1];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned scaled add B += alpha*A on the top 2 rows of each panel column
void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A[0];
		B[1] += alpha * A[1];
		A += 4;
		B += 4;
		}

	}
+
+
+
// scaled add B += alpha*A, 2-row width, source shifted 3 rows down across
// the panel pair
void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	float alpha = alphap[0];

	float *A1 = A0 + bs*sda;

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A0[3];
		B[1] += alpha * A1[0];
		A0 += bs;
		A1 += bs;
		B += bs;
		}

	}
+
+
+
// aligned scaled add B += alpha*A on row 0 only of each panel column
void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	if(kmax<=0)
		return;

	float alpha = alphap[0];

	int ii;
	for(ii=0; ii<kmax; ii++)
		{
		B[0] += alpha * A[0];
		A += 4;
		B += 4;
		}

	}
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgemm_4x4_lib4.c b/kernel/c99/kernel_sgemm_4x4_lib4.c
new file mode 100644
index 0000000..243d559
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_4x4_lib4.c
@@ -0,0 +1,6094 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D[m0:m1, n0:n1] = beta*C + alpha*A*B^T on a 4x4 tile, with arbitrary
// row offsets into the C and D panels (offsetC/offsetD assumed in 0..3,
// as in the original branch structure)
void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// cc[j][i] accumulates entry (i,j) of A*B^T; the per-entry
	// floating-point accumulation order (increasing k) is identical to
	// the unrolled original, so results match bit-for-bit
	float cc[4][4] = {{0.0f}};

	int k, i, j;

	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				cc[j][i] += A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// blend with C: logical row i lives at panel row offsetC+i, spilling
	// into the next panel (C0 + sdc*bs) when offsetC+i >= bs
	for(i=0; i<4; i++)
		{
		int ri = offsetC + i;
		float *Cr = ri<bs ? C0+ri : C0+sdc*bs+ri-bs;
		for(j=0; j<4; j++)
			{
			cc[j][i] = beta[0]*Cr[bs*j] + alpha[0]*cc[j][i];
			}
		}

	// shift the solution left by n0 columns (clamped to 3, matching the
	// original's else-branch); stale rightmost columns are never stored
	// because kn = n1-n0 bounds the store loop below
	if(n0>0)
		{
		int sh = n0<3 ? n0 : 3;
		for(j=0; j+sh<4; j++)
			{
			for(i=0; i<4; i++)
				{
				cc[j][i] = cc[j+sh][i];
				}
			}
		D0 += sh*bs;
		}

	int kn = n1 - n0;

	// masked store: column j only while j < kn, row i only if m0 <= i < m1;
	// rows spill into the next panel (D0 + sdd*bs) when offsetD+i >= bs
	for(j=0; j<4; j++)
		{
		if(kn<=j)
			return;
		for(i=0; i<4; i++)
			{
			if(m0<=i & m1>i)
				{
				int ri = offsetD + i;
				float *Dr = ri<bs ? D0+ri : D0+sdd*bs+ri-bs;
				Dr[bs*j] = cc[j][i];
				}
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// D = beta*C + alpha*A*B^T on a 4x4 tile, storing at most km rows and
// kn columns (variable-size store for matrix edges)
void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	// cc[j][i] accumulates entry (i,j) of A*B^T; per-entry FP
	// accumulation order (increasing k) matches the unrolled original
	float cc[4][4] = {{0.0f}};

	int k, i, j;

	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			for(i=0; i<4; i++)
				{
				cc[j][i] += A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// blend the full 4x4 tile: cc = beta*C + alpha*cc
	for(j=0; j<4; j++)
		{
		for(i=0; i<4; i++)
			{
			cc[j][i] = beta[0]*C[i+bs*j] + alpha[0]*cc[j][i];
			}
		}

	// number of rows stored, clamped to 1..4 exactly as in the original
	int imax = km>=4 ? 4 : km>=3 ? 3 : km>=2 ? 2 : 1;

	for(j=0; j<4; j++)
		{
		for(i=0; i<imax; i++)
			{
			D[i+bs*j] = cc[j][i];
			}
		// stop after the kn-th column (kn outside 1..3 stores all 4)
		if(kn==j+1)
			return;
		}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
// Fixed-size 4x4 single-precision gemm NT kernel: forwards to the
// variable-size variant requesting the full 4x4 block (km = kn = 4).
void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
	return;
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 single-precision gemm NN kernel with variable store size:
// D[0:km,0:kn] = beta*C + alpha*A*B, panel-major storage with block size 4.
// A is one 4-row panel (element (k,i) at A[4*k+i]); B is stored in 4-row
// panels of panel stride sdb, so within the current panel element (k,j)
// sits at B[(k%4) + 4*j].
//
// The original 4x-unrolled loop is rolled up here: each k performs the
// same 16 multiply-adds in the same order, so the floating-point result
// is bit-identical.
void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	float a_0, a_1, a_2, a_3;
	float b_0, b_1, b_2, b_3;
	float cc[4][4] = {{0.0f}};	// cc[col][row] accumulators, zero-initialized

	int k, ii, jj, kk;

	for(k=0; k<kmax; k++)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		b_1 = B[4];
		b_2 = B[8];
		b_3 = B[12];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		cc[1][0] += a_0 * b_1;
		cc[1][1] += a_1 * b_1;
		cc[1][2] += a_2 * b_1;
		cc[1][3] += a_3 * b_1;

		cc[2][0] += a_0 * b_2;
		cc[2][1] += a_1 * b_2;
		cc[2][2] += a_2 * b_2;
		cc[2][3] += a_3 * b_2;

		cc[3][0] += a_0 * b_3;
		cc[3][1] += a_1 * b_3;
		cc[3][2] += a_2 * b_3;
		cc[3][3] += a_3 * b_3;

		A += 4;
		B += 1;
		if(k%4==3)
			B += 4*sdb - 4;	// jump to the next 4-row panel of B
		}

	// blend with C: cc = beta*C + alpha*cc (all 16 entries, as the original)
	for(jj=0; jj<4; jj++)
		for(ii=0; ii<4; ii++)
			cc[jj][ii] = beta[0]*C[ii+bs*jj] + alpha[0]*cc[jj][ii];

	// masked store: the original branch ladder writes max(1, min(km,4)) rows
	// and stops after column jj when kn==jj+1
	kk = km<1 ? 1 : (km>4 ? 4 : km);
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<kk; ii++)
			D[ii+bs*jj] = cc[jj][ii];
		if(kn==jj+1)
			return;
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size 4x4 single-precision gemm NN kernel: forwards to the
// variable-size variant requesting the full 4x4 block (km = kn = 4).
void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
	{
	kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, 4, 4);
	return;
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 single-precision syrk NT kernel, lower-triangle result, with variable
// store size: D = beta*C + alpha*A*B^T restricted to the lower triangle
// (row >= col). A and B are 4-row panels (element (k,i) at A[4*k+i]).
// Only the 10 lower-triangle accumulators are computed, blended and stored;
// the strict upper triangle of C and D is never touched, matching the
// original.
//
// The original 4x-unrolled loop is rolled up: each k performs the same
// multiply-adds in the same order, so results are bit-identical.
void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	float a_0, a_1, a_2, a_3;
	float b_0, b_1, b_2, b_3;
	float cc[4][4] = {{0.0f}};	// cc[col][row]; only row>=col entries are used

	int k, ii, jj, kk;

	for(k=0; k<kmax; k++)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		b_1 = B[1];
		b_2 = B[2];
		b_3 = B[3];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		cc[1][1] += a_1 * b_1;
		cc[1][2] += a_2 * b_1;
		cc[1][3] += a_3 * b_1;

		cc[2][2] += a_2 * b_2;
		cc[2][3] += a_3 * b_2;

		cc[3][3] += a_3 * b_3;

		A += 4;
		B += 4;
		}

	// blend with C, lower triangle only (the original never reads upper C)
	for(jj=0; jj<4; jj++)
		for(ii=jj; ii<4; ii++)
			cc[jj][ii] = beta[0]*C[ii+bs*jj] + alpha[0]*cc[jj][ii];

	// masked store of the lower triangle: max(1, min(km,4)) rows,
	// stop after column jj when kn==jj+1 — same observable writes as the
	// original branch ladder
	kk = km<1 ? 1 : (km>4 ? 4 : km);
	for(jj=0; jj<4; jj++)
		{
		for(ii=jj; ii<kk; ii++)
			D[ii+bs*jj] = cc[jj][ii];
		if(kn==jj+1)
			return;
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size 4x4 lower-triangle syrk NT kernel: forwards to the
// variable-size variant requesting the full 4x4 block (km = kn = 4).
void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
	return;
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 single-precision trmm NT kernel, right upper-triangular B, with
// variable store size: D = beta*C + alpha * A * triu(B)^T. A and B are
// 4-row panels (element (k,i) at A[4*k+i]); because B is upper triangular,
// column j of the result only receives contributions from k >= j — hence
// the three partial preamble steps before the full-width loop.
//
// BUG FIX: the k=1 and k=2 preamble guards tested kmax>0 instead of
// kmax>1 / kmax>2, so for kmax==1 (or 2) the kernel read past the valid
// range of A and B and accumulated spurious terms into the result.
void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	float a_0, a_1, a_2, a_3;
	float b_0, b_1, b_2, b_3;
	float cc[4][4] = {{0.0f}};	// cc[col][row] accumulators

	int k, ii, jj, kk;

	k = 0;

	// k = 0: column 0 only
	if(kmax>0)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		A += 4;
		B += 4;
		k++;
		}

	// k = 1: columns 0-1 (was: if(kmax>0) — the bug)
	if(kmax>1)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		b_1 = B[1];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		cc[1][0] += a_0 * b_1;
		cc[1][1] += a_1 * b_1;
		cc[1][2] += a_2 * b_1;
		cc[1][3] += a_3 * b_1;

		A += 4;
		B += 4;
		k++;
		}

	// k = 2: columns 0-2 (was: if(kmax>0) — the bug)
	if(kmax>2)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		b_1 = B[1];
		b_2 = B[2];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		cc[1][0] += a_0 * b_1;
		cc[1][1] += a_1 * b_1;
		cc[1][2] += a_2 * b_1;
		cc[1][3] += a_3 * b_1;

		cc[2][0] += a_0 * b_2;
		cc[2][1] += a_1 * b_2;
		cc[2][2] += a_2 * b_2;
		cc[2][3] += a_3 * b_2;

		A += 4;
		B += 4;
		k++;
		}

	// k >= 3: all four columns active
	for(; k<kmax; k++)
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		b_1 = B[1];
		b_2 = B[2];
		b_3 = B[3];

		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		cc[1][0] += a_0 * b_1;
		cc[1][1] += a_1 * b_1;
		cc[1][2] += a_2 * b_1;
		cc[1][3] += a_3 * b_1;

		cc[2][0] += a_0 * b_2;
		cc[2][1] += a_1 * b_2;
		cc[2][2] += a_2 * b_2;
		cc[2][3] += a_3 * b_2;

		cc[3][0] += a_0 * b_3;
		cc[3][1] += a_1 * b_3;
		cc[3][2] += a_2 * b_3;
		cc[3][3] += a_3 * b_3;

		A += 4;
		B += 4;
		}

	// blend with C: cc = beta*C + alpha*cc
	for(jj=0; jj<4; jj++)
		for(ii=0; ii<4; ii++)
			cc[jj][ii] = beta[0]*C[ii+bs*jj] + alpha[0]*cc[jj][ii];

	// masked store: max(1, min(km,4)) rows; stop after column jj when kn==jj+1
	kk = km<1 ? 1 : (km>4 ? 4 : km);
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<kk; ii++)
			D[ii+bs*jj] = cc[jj][ii];
		if(kn==jj+1)
			return;
		}

	return;

	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size 4x4 trmm NT (right upper-triangular) kernel: forwards to the
// variable-size variant requesting the full 4x4 block (km = kn = 4).
void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
	return;
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 single-precision trmm NN kernel, right lower-triangular B, generalized
// store: computes alpha * A * tril(B) and writes it into D with arbitrary
// column offset in B (offsetB), row offset in D (offsetD, spilling into the
// next panel D1 = D0 + sdd*bs), row mask [m0,m1), and column window [n0,n1).
//
// B is stored in 4-row panels of panel stride sdb; after the initial
// B += offsetB, each k advances B by one row, wrapping to the next panel
// (B += 4*sdb-3) when the intra-panel row index reaches 4. Because B is
// lower triangular, column j only contributes for k >= j — in the original
// this is spelled out as four offsetB-specific preambles, but the column
// availability pattern is identical in all of them, so a single loop with
// per-column guards reproduces the exact same operation sequence.
void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	float a_0, a_1, a_2, a_3;
	float b_0, b_1, b_2, b_3;
	float cc[4][4] = {{0.0f}};	// cc[col][row] accumulators

	float *D1;

	int k, ii, jj, ll, sh, kn;

	B += offsetB;
	ll = offsetB;	// row index inside the current 4-row panel of B
	k = 0;

	// NOTE: like the original, the first k-step runs before any bound check,
	// so kmax<=0 still processes one term before storing.
	do
		{
		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0];
		cc[0][0] += a_0 * b_0;
		cc[0][1] += a_1 * b_0;
		cc[0][2] += a_2 * b_0;
		cc[0][3] += a_3 * b_0;

		if(k>=1)
			{
			b_1 = B[4];
			cc[1][0] += a_0 * b_1;
			cc[1][1] += a_1 * b_1;
			cc[1][2] += a_2 * b_1;
			cc[1][3] += a_3 * b_1;
			}

		if(k>=2)
			{
			b_2 = B[8];
			cc[2][0] += a_0 * b_2;
			cc[2][1] += a_1 * b_2;
			cc[2][2] += a_2 * b_2;
			cc[2][3] += a_3 * b_2;
			}

		if(k>=3)
			{
			b_3 = B[12];
			cc[3][0] += a_0 * b_3;
			cc[3][1] += a_1 * b_3;
			cc[3][2] += a_2 * b_3;
			cc[3][3] += a_3 * b_3;
			}

		A += 4;
		k += 1;
		ll += 1;
		if(ll==4)
			{
			ll = 0;
			B += 4*sdb - 3;	// wrap to the next 4-row panel of B
			}
		else
			{
			B += 1;
			}
		}
	while(k<kmax);

	// scale by alpha (this kernel has no beta/C term)
	for(jj=0; jj<4; jj++)
		for(ii=0; ii<4; ii++)
			cc[jj][ii] = alpha[0]*cc[jj][ii];

	// shift the result left by n0 columns (capped at 3, as the original's
	// else-branch treats any n0>=3 as 3) and advance D accordingly
	if(n0>0)
		{
		sh = n0<3 ? n0 : 3;
		for(jj=0; jj<4-sh; jj++)
			{
			cc[jj][0] = cc[jj+sh][0];
			cc[jj][1] = cc[jj+sh][1];
			cc[jj][2] = cc[jj+sh][2];
			cc[jj][3] = cc[jj+sh][3];
			}
		D0 += sh*bs;
		}

	kn = n1 - n0;

	D1 = D0 + sdd*bs;	// next panel of D, used when offsetD shifts rows past row 3

	// masked, row-offset store: result row ii lands at row offsetD+ii,
	// spilling into D1 when that exceeds the panel; rows gated by [m0,m1),
	// columns stored until kn is exhausted
	for(jj=0; jj<4; jj++)
		{
		if(kn<=jj)
			return;
		for(ii=0; ii<4; ii++)
			{
			if(m0<=ii & m1>ii)
				{
				if(offsetD+ii<4)
					D0[offsetD+ii+bs*jj] = cc[jj][ii];
				else
					D1[offsetD+ii-4+bs*jj] = cc[jj][ii];
				}
			}
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size strmm (nn, right-lower) 4x4 kernel: thin forwarder to the
// generalized kernel, selecting the complete 4x4 output tile (rows [0,4),
// cols [0,4)) with zero store offset.
void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
	{
	kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size Cholesky kernel: the variable-size kernel with the full
// km = kn = 4 tile.
void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
	{
	kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused syrk + Cholesky (variable size): first D = C + Ap*Bp^T (lower,
// alpha = beta = 1), then factorize D - Am*Bm^T in place into D.
void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
	{
	float a_one = 1.0f;
	float b_one = 1.0f;
	kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &a_one, Ap, Bp, &b_one, C, D, km, kn);
	kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused syrk + Cholesky (fixed size): D = C + Ap*Bp^T (lower), then
// factorize D - Am*Bm^T in place into D.
void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
	{
	float a_one = 1.0f;
	float b_one = 1.0f;
	kernel_ssyrk_nt_l_4x4_lib4(kp, &a_one, Ap, Bp, &b_one, C, D);
	kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 right triangular solve, lower factor, transposed,
// with precomputed inverse diagonal:
//   D = (C - A*B^T) * E^{-T}
// over the leading km x kn part of the tile. E is lower triangular;
// inv_diag_E holds the reciprocals of its diagonal (no divisions here).
void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// cc[j][i] = element (row i, col j) of the 4x4 accumulator
	float cc[4][4] = {{0.0f}};
	float tmp;
	int i, j, k;
	int imax, jmax;

	// cc -= A * B^T, panel-major with bs=4; per-k operation order matches
	// the original unrolled code, so results are bit-identical
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			tmp = B[j];
			for(i=0; i<4; i++)
				cc[j][i] -= A[i] * tmp;
			}
		A += bs;
		B += bs;
		}

	// cc += C
	for(j=0; j<4; j++)
		for(i=0; i<4; i++)
			cc[j][i] += C[i+bs*j];

	// forward substitution over columns: column j only needs the
	// already-solved columns 0..j-1 of the result
	tmp = inv_diag_E[0];
	for(i=0; i<4; i++)
		cc[0][i] *= tmp;

	if(kn==1)
		goto store;

	for(j=1; j<4; j++)
		{
		for(k=0; k<j; k++)
			{
			tmp = E[j+bs*k];
			for(i=0; i<4; i++)
				cc[j][i] -= cc[k][i] * tmp;
			}
		tmp = inv_diag_E[j];
		for(i=0; i<4; i++)
			cc[j][i] *= tmp;
		if(kn==j+1)
			goto store;
		}

	store:

	// store the km x kn rectangle; clamp exactly like the original branch
	// ladder (km<2 -> 1 row, kn outside 1..3 -> 4 cols)
	imax = km>3 ? 4 : (km<2 ? 1 : km);
	jmax = (kn>0 && kn<4) ? kn : 4;
	for(j=0; j<jmax; j++)
		for(i=0; i<imax; i++)
			D[i+bs*j] = cc[j][i];

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size trsm (right-lower, transposed, inverted diagonal): the
// variable-size kernel with the full km = kn = 4 tile.
void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused gemm + trsm (variable size): D = C + Ap*Bp^T (alpha = beta = 1),
// then solve (D - Am*Bm^T) * E^{-T} in place into D.
void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{
	float a_one = 1.0f;
	float b_one = 1.0f;
	kernel_sgemm_nt_4x4_vs_lib4(kp, &a_one, Ap, Bp, &b_one, C, D, km, kn);
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused gemm + trsm (fixed size): D = C + Ap*Bp^T (alpha = beta = 1),
// then solve (D - Am*Bm^T) * E^{-T} in place into D.
void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
	{
	float a_one = 1.0f;
	float b_one = 1.0f;
	kernel_sgemm_nt_4x4_lib4(kp, &a_one, Ap, Bp, &b_one, C, D);
	kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 right triangular solve, lower factor with UNIT
// diagonal, transposed:
//   D = (C - A*B^T) * E^{-T}
// over the leading km x kn part of the tile. E's diagonal is implicitly 1,
// so no scaling (and no inv_diag array) is needed.
void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	// cc[j][i] = element (row i, col j) of the 4x4 accumulator
	float cc[4][4] = {{0.0f}};
	float tmp;
	int i, j, k;
	int imax, jmax;

	// cc -= A * B^T; per-k operation order matches the original unrolled
	// code, so results are bit-identical
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			tmp = B[j];
			for(i=0; i<4; i++)
				cc[j][i] -= A[i] * tmp;
			}
		A += bs;
		B += bs;
		}

	// cc += C
	for(j=0; j<4; j++)
		for(i=0; i<4; i++)
			cc[j][i] += C[i+bs*j];

	// forward substitution over columns; unit diagonal means column 0 is
	// already solved and no per-column scaling is required
	if(kn==1)
		goto store;

	for(j=1; j<4; j++)
		{
		for(k=0; k<j; k++)
			{
			tmp = E[j+bs*k];
			for(i=0; i<4; i++)
				cc[j][i] -= cc[k][i] * tmp;
			}
		if(kn==j+1)
			goto store;
		}

	store:

	// store the km x kn rectangle; clamp exactly like the original branch
	// ladder (km<2 -> 1 row, kn outside 1..3 -> 4 cols)
	imax = km>3 ? 4 : (km<2 ? 1 : km);
	jmax = (kn>0 && kn<4) ? kn : 4;
	for(j=0; j<jmax; j++)
		for(i=0; i<imax; i++)
			D[i+bs*j] = cc[j][i];

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size trsm (right-lower, unit diagonal, transposed): the
// variable-size kernel with the full km = kn = 4 tile.
void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
	{
	kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size 4x4 right triangular solve, UPPER factor, transposed,
// with precomputed inverse diagonal:
//   D = (C - A*B^T) * E^{-T}
// over the leading km x kn part of the tile. E is upper triangular;
// inv_diag_E holds the reciprocals of its diagonal. The solve runs
// BACKWARD over columns (3 -> 0); columns >= kn are skipped.
//
// Fix vs original: the original carried an unused 'store:' label (nothing
// ever jumps to it in this kernel, unlike its rl/ru siblings), which draws
// a -Wunused-label warning; removed. The heavily duplicated unrolled code
// is folded into loops with identical floating-point operation order.
void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// cc[j][i] = element (row i, col j) of the 4x4 accumulator
	float cc[4][4] = {{0.0f}};
	float tmp;
	int i, j, k;
	int imax, jmax;

	// cc -= A * B^T; per-k operation order matches the original unrolled
	// code, so results are bit-identical
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			tmp = B[j];
			for(i=0; i<4; i++)
				cc[j][i] -= A[i] * tmp;
			}
		A += bs;
		B += bs;
		}

	// cc += C
	for(j=0; j<4; j++)
		for(i=0; i<4; i++)
			cc[j][i] += C[i+bs*j];

	// backward substitution: finish column j, then eliminate it from the
	// earlier columns k < j (same order as the original: k = j-1 .. 0)
	for(j=3; j>0; j--)
		{
		if(kn>j)
			{
			tmp = inv_diag_E[j];
			for(i=0; i<4; i++)
				cc[j][i] *= tmp;
			for(k=j-1; k>=0; k--)
				{
				tmp = E[k+bs*j];
				for(i=0; i<4; i++)
					cc[k][i] -= cc[j][i] * tmp;
				}
			}
		}

	// column 0 only needs its diagonal scaling
	tmp = inv_diag_E[0];
	for(i=0; i<4; i++)
		cc[0][i] *= tmp;

	// store the km x kn rectangle; clamp exactly like the original branch
	// ladder (km<2 -> 1 row, kn outside 1..3 -> 4 cols)
	imax = km>3 ? 4 : (km<2 ? 1 : km);
	jmax = (kn>0 && kn<4) ? kn : 4;
	for(j=0; j<jmax; j++)
		for(i=0; i<imax; i++)
			D[i+bs*j] = cc[j][i];

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size trsm (right-upper, transposed, inverted diagonal): the
// variable-size kernel with the full km = kn = 4 tile.
void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // factorization
+
+ // first column
+ tmp = 1.0 / c_00;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ // second column
+ c_11 -= c_10 * c_01;
+ c_21 -= c_20 * c_01;
+ c_31 -= c_30 * c_01;
+
+ tmp = 1.0 / c_11;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ // third column
+ c_12 -= c_10 * c_02;
+ c_22 -= c_20 * c_02;
+ c_32 -= c_30 * c_02;
+
+ c_22 -= c_21 * c_12;
+ c_32 -= c_31 * c_12;
+
+ tmp = 1.0 / c_22;
+ c_32 *= tmp;
+
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ // fourth column
+ c_13 -= c_10 * c_03;
+ c_23 -= c_20 * c_03;
+ c_33 -= c_30 * c_03;
+
+ c_23 -= c_21 * c_13;
+ c_33 -= c_31 * c_13;
+
+ c_33 -= c_32 * c_23;
+
+ tmp = 1.0 / c_33;
+
+ inv_diag_D[3] = tmp;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size (full 4x4) variant: forwards to the variable-size kernel with km=4, kn=4.
+void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
+	{
+	kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size triangular solve kernel (nn, left-lower, unit diagonal):
+// computes the 4x4 block  D = E^{-1} * ( C - A * B ),  where E is lower
+// triangular with implicit unit diagonal (its diagonal entries are not read).
+// A, C, D, E are 4x4(-panel) blocks in lib4 layout (bs=4); B is a kmax x 4
+// panel with panel stride sdb.  Only the top-left km x kn part of the result
+// is stored (km, kn in 1..4).
+// Fix w.r.t. previous revision: removed the unused local `tmp`
+// (-Wunused-variable); the 4x unrolled sub-iterations of the main loop are
+// expressed as an inner loop performing the identical operation sequence.
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k, l;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_1, e_2, e_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+
+	if(kmax<=0)
+		goto add;
+
+	// accumulate -A*B, four k-iterations per pass
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(l=0; l<4; l++)
+			{
+
+			a_0 = A[0+bs*l];
+			a_1 = A[1+bs*l];
+			a_2 = A[2+bs*l];
+			a_3 = A[3+bs*l];
+
+			b_0 = B[l+bs*0];
+			b_1 = B[l+bs*1];
+			b_2 = B[l+bs*2];
+			b_3 = B[l+bs*3];
+
+			c_00 -= a_0 * b_0;
+			c_10 -= a_1 * b_0;
+			c_20 -= a_2 * b_0;
+			c_30 -= a_3 * b_0;
+
+			c_01 -= a_0 * b_1;
+			c_11 -= a_1 * b_1;
+			c_21 -= a_2 * b_1;
+			c_31 -= a_3 * b_1;
+
+			c_02 -= a_0 * b_2;
+			c_12 -= a_1 * b_2;
+			c_22 -= a_2 * b_2;
+			c_32 -= a_3 * b_2;
+
+			c_03 -= a_0 * b_3;
+			c_13 -= a_1 * b_3;
+			c_23 -= a_2 * b_3;
+			c_33 -= a_3 * b_3;
+
+			}
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0..3 k-iterations
+	for(; k<kmax; k++)
+		{
+
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+
+	add:
+
+	// add C to the accumulator
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solution: forward substitution with unit-diagonal E;
+	// rows beyond km are never stored, so their updates can be skipped
+
+	if(km==1)
+		goto store;
+
+	e_1 = E[1+bs*0];
+	e_2 = E[2+bs*0];
+	e_3 = E[3+bs*0];
+	c_10 -= e_1 * c_00;
+	c_20 -= e_2 * c_00;
+	c_30 -= e_3 * c_00;
+	c_11 -= e_1 * c_01;
+	c_21 -= e_2 * c_01;
+	c_31 -= e_3 * c_01;
+	c_12 -= e_1 * c_02;
+	c_22 -= e_2 * c_02;
+	c_32 -= e_3 * c_02;
+	c_13 -= e_1 * c_03;
+	c_23 -= e_2 * c_03;
+	c_33 -= e_3 * c_03;
+
+	if(km==2)
+		goto store;
+
+	e_2 = E[2+bs*1];
+	e_3 = E[3+bs*1];
+	c_20 -= e_2 * c_10;
+	c_30 -= e_3 * c_10;
+	c_21 -= e_2 * c_11;
+	c_31 -= e_3 * c_11;
+	c_22 -= e_2 * c_12;
+	c_32 -= e_3 * c_12;
+	c_23 -= e_2 * c_13;
+	c_33 -= e_3 * c_13;
+
+	if(km==3)
+		goto store;
+
+	e_3 = E[3+bs*2];
+	c_30 -= e_3 * c_20;
+	c_31 -= e_3 * c_21;
+	c_32 -= e_3 * c_22;
+	c_33 -= e_3 * c_23;
+
+	store:
+
+	// store only the top-left km x kn part of the result
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size (full 4x4) variant: forwards to the variable-size kernel with km=4, kn=4.
+void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
+	{
+	kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size triangular solve kernel (nn, right-upper, inverted diagonal):
+// computes the 4x4 block  D = ( C - A * B ) * E^{-1},  where E is upper
+// triangular; inv_diag_E holds the reciprocals of the diagonal of E, so no
+// division is performed here.  A, C, D, E are blocks in lib4 layout (bs=4);
+// B is a kmax x 4 panel with panel stride sdb.  Only the top-left km x kn
+// part of the result is stored (km, kn in 1..4).
+// Fix w.r.t. previous revision: removed the unused local `tmp`
+// (-Wunused-variable); the 4x unrolled sub-iterations of the main loop are
+// expressed as an inner loop performing the identical operation sequence.
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k, l;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+		            e_22, e_23,
+		                  e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+
+	if(kmax<=0)
+		goto add;
+
+	// accumulate -A*B, four k-iterations per pass
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(l=0; l<4; l++)
+			{
+
+			a_0 = A[0+bs*l];
+			a_1 = A[1+bs*l];
+			a_2 = A[2+bs*l];
+			a_3 = A[3+bs*l];
+
+			b_0 = B[l+bs*0];
+			b_1 = B[l+bs*1];
+			b_2 = B[l+bs*2];
+			b_3 = B[l+bs*3];
+
+			c_00 -= a_0 * b_0;
+			c_10 -= a_1 * b_0;
+			c_20 -= a_2 * b_0;
+			c_30 -= a_3 * b_0;
+
+			c_01 -= a_0 * b_1;
+			c_11 -= a_1 * b_1;
+			c_21 -= a_2 * b_1;
+			c_31 -= a_3 * b_1;
+
+			c_02 -= a_0 * b_2;
+			c_12 -= a_1 * b_2;
+			c_22 -= a_2 * b_2;
+			c_32 -= a_3 * b_2;
+
+			c_03 -= a_0 * b_3;
+			c_13 -= a_1 * b_3;
+			c_23 -= a_2 * b_3;
+			c_33 -= a_3 * b_3;
+
+			}
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0..3 k-iterations
+	for(; k<kmax; k++)
+		{
+
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+
+	add:
+
+	// add C to the accumulator
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solve: column-by-column forward substitution against upper-triangular E,
+	// multiplying by the precomputed reciprocal of each diagonal entry;
+	// columns beyond kn are never stored, so their updates can be skipped
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	store:
+
+	// store only the top-left km x kn part of the result
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size (full 4x4) variant: forwards to the variable-size kernel with km=4, kn=4.
+void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size triangular solve kernel (nn, left-upper, inverted diagonal):
+// computes the 4x4 block  D = E^{-1} * ( C - A * B ),  where E is upper
+// triangular; inv_diag_E holds the reciprocals of the diagonal of E, so no
+// division is performed here.  A, C, D, E are blocks in lib4 layout (bs=4);
+// B is a kmax x 4 panel with panel stride sdb.  Only the top-left km x kn
+// part of the result is stored (km, kn in 1..4).
+// Fixes w.r.t. previous revision: removed the unused local `tmp`
+// (-Wunused-variable), the unreferenced `store:` label (-Wunused-label; this
+// kernel has no `goto store`), and commented-out debug printf code; the 4x
+// unrolled sub-iterations of the main loop are expressed as an inner loop
+// performing the identical operation sequence.
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k, l;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+		            e_22, e_23,
+		                  e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+
+	if(kmax<=0)
+		goto add;
+
+	// accumulate -A*B, four k-iterations per pass
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(l=0; l<4; l++)
+			{
+
+			a_0 = A[0+bs*l];
+			a_1 = A[1+bs*l];
+			a_2 = A[2+bs*l];
+			a_3 = A[3+bs*l];
+
+			b_0 = B[l+bs*0];
+			b_1 = B[l+bs*1];
+			b_2 = B[l+bs*2];
+			b_3 = B[l+bs*3];
+
+			c_00 -= a_0 * b_0;
+			c_10 -= a_1 * b_0;
+			c_20 -= a_2 * b_0;
+			c_30 -= a_3 * b_0;
+
+			c_01 -= a_0 * b_1;
+			c_11 -= a_1 * b_1;
+			c_21 -= a_2 * b_1;
+			c_31 -= a_3 * b_1;
+
+			c_02 -= a_0 * b_2;
+			c_12 -= a_1 * b_2;
+			c_22 -= a_2 * b_2;
+			c_32 -= a_3 * b_2;
+
+			c_03 -= a_0 * b_3;
+			c_13 -= a_1 * b_3;
+			c_23 -= a_2 * b_3;
+			c_33 -= a_3 * b_3;
+
+			}
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0..3 k-iterations
+	for(; k<kmax; k++)
+		{
+
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+
+	add:
+
+	// add C to the accumulator
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solve: backward substitution from the bottom row up; rows at index >= km
+	// are skipped (their accumulators are not part of the stored result)
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	// store only the top-left km x kn part of the result
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size (full 4x4) variant: forwards to the variable-size kernel with km=4, kn=4.
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
diff --git a/kernel/c99/kernel_sgemm_diag_lib4.c b/kernel/c99/kernel_sgemm_diag_lib4.c
new file mode 100644
index 0000000..93df707
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_diag_lib4.c
@@ -0,0 +1,1112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or                                        *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* BLASFEO is distributed in the hope that it will be useful,                                      *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with BLASFEO; if not, write to the Free Software                                  *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = alpha * A * diag(B), beta==0 case (C is not read).
+// A and D are panel-major (lib4, bs=4) with panel strides sda and sdd;
+// B holds the 4 diagonal entries of the right-hand diagonal matrix.
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k, j;
+
+	float b[4];
+
+	// scale the diagonal once, up front
+	b[0] = alpha[0] * B[0];
+	b[1] = alpha[0] * B[1];
+	b[2] = alpha[0] * B[2];
+	b[3] = alpha[0] * B[3];
+
+	// full panels of 4 rows
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(j=0; j<4; j++)
+			{
+			D[0+bs*j] = A[0+bs*j] * b[j];
+			D[1+bs*j] = A[1+bs*j] * b[j];
+			D[2+bs*j] = A[2+bs*j] * b[j];
+			D[3+bs*j] = A[3+bs*j] * b[j];
+			}
+
+		A += 4*sda;
+		D += 4*sdd;
+
+		}
+	// remaining 0..3 rows, one at a time
+	for(; k<kmax; k++)
+		{
+
+		for(j=0; j<4; j++)
+			{
+			D[0+bs*j] = A[0+bs*j] * b[j];
+			}
+
+		A += 1;
+		D += 1;
+
+		}
+
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B), over all 4 columns.
+// A, C, D are panel-major (lib4, bs=4) with panel strides sda, sdc, sdd;
+// B holds the 4 diagonal entries of the right-hand diagonal matrix.
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k, j;
+
+	float bet, b[4];
+
+	bet = beta[0];
+
+	// scale the diagonal once, up front
+	b[0] = alpha[0] * B[0];
+	b[1] = alpha[0] * B[1];
+	b[2] = alpha[0] * B[2];
+	b[3] = alpha[0] * B[3];
+
+	// full panels of 4 rows
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(j=0; j<4; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			D[1+bs*j] = bet * C[1+bs*j] + A[1+bs*j] * b[j];
+			D[2+bs*j] = bet * C[2+bs*j] + A[2+bs*j] * b[j];
+			D[3+bs*j] = bet * C[3+bs*j] + A[3+bs*j] * b[j];
+			}
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	// remaining 0..3 rows, one at a time
+	for(; k<kmax; k++)
+		{
+
+		for(j=0; j<4; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			}
+
+		A += 1;
+		C += 1;
+		D += 1;
+
+		}
+
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B), 3-column variant.
+// A, C, D are panel-major (lib4, bs=4) with panel strides sda, sdc, sdd;
+// B holds the 3 diagonal entries of the right-hand diagonal matrix.
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k, j;
+
+	float bet, b[3];
+
+	bet = beta[0];
+
+	// scale the diagonal once, up front
+	b[0] = alpha[0] * B[0];
+	b[1] = alpha[0] * B[1];
+	b[2] = alpha[0] * B[2];
+
+	// full panels of 4 rows
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(j=0; j<3; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			D[1+bs*j] = bet * C[1+bs*j] + A[1+bs*j] * b[j];
+			D[2+bs*j] = bet * C[2+bs*j] + A[2+bs*j] * b[j];
+			D[3+bs*j] = bet * C[3+bs*j] + A[3+bs*j] * b[j];
+			}
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	// remaining 0..3 rows, one at a time
+	for(; k<kmax; k++)
+		{
+
+		for(j=0; j<3; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			}
+
+		A += 1;
+		C += 1;
+		D += 1;
+
+		}
+
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B), 2-column variant.
+// A, C, D are panel-major (lib4, bs=4) with panel strides sda, sdc, sdd;
+// B holds the 2 diagonal entries of the right-hand diagonal matrix.
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k, j;
+
+	float bet, b[2];
+
+	bet = beta[0];
+
+	// scale the diagonal once, up front
+	b[0] = alpha[0] * B[0];
+	b[1] = alpha[0] * B[1];
+
+	// full panels of 4 rows
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		for(j=0; j<2; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			D[1+bs*j] = bet * C[1+bs*j] + A[1+bs*j] * b[j];
+			D[2+bs*j] = bet * C[2+bs*j] + A[2+bs*j] * b[j];
+			D[3+bs*j] = bet * C[3+bs*j] + A[3+bs*j] * b[j];
+			}
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	// remaining 0..3 rows, one at a time
+	for(; k<kmax; k++)
+		{
+
+		for(j=0; j<2; j++)
+			{
+			D[0+bs*j] = bet * C[0+bs*j] + A[0+bs*j] * b[j];
+			}
+
+		A += 1;
+		C += 1;
+		D += 1;
+
+		}
+
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+// Computes D[kmax x 1] = beta*C + alpha * A * diag(B[0:1]): the single
+// column of A is scaled by B[0]. A/C/D are stored panel-major (lib4,
+// panel height 4) with panel strides sda/sdc/sdd.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // pre-scale the diagonal entry of B by alpha
+ b_0 = alpha0 * B[0];
+
+ // main loop: one full 4-row panel per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ // advance to the next 4-row panel
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ // clean-up loop: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+// Computes D[4 x kmax] = alpha * diag(A[0:4]) * B: row i of B is scaled
+// by the i-th diagonal entry of A. All data stays inside one 4-row panel
+// (lib4 format), so B/D advance by 16 floats = 4 columns per iteration.
+// NOTE(review): parameter `alg` appears unused here (likely a legacy
+// argument) -- confirm against the callers before removing.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ // pre-scale the four diagonal entries of A by alpha
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ // main loop: four columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ D += 16;
+
+ }
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+// Computes D[4 x kmax] = beta*C + alpha * diag(A[0:4]) * B: row i of B is
+// scaled by the i-th diagonal entry of A. All data stays inside one 4-row
+// panel (lib4 format), so B/C/D advance by 16 floats = 4 columns.
+// NOTE(review): parameter `alg` appears unused here (likely a legacy
+// argument) -- confirm against the callers before removing.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // pre-scale the four diagonal entries of A by alpha
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ // main loop: four columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+// Computes D[3 x kmax] = beta*C + alpha * diag(A[0:3]) * B: same as the
+// left_4 kernel but only the top 3 rows of the panel are touched.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // pre-scale the three diagonal entries of A by alpha
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ // main loop: four columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+// Computes D[2 x kmax] = beta*C + alpha * diag(A[0:2]) * B: only the top
+// 2 rows of the panel are touched.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // pre-scale the two diagonal entries of A by alpha
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ // main loop: four columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+// Computes D[1 x kmax] = beta*C + alpha * diag(A[0:1]) * B: only the top
+// row of the panel is touched.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ // pre-scale the diagonal entry of A by alpha
+ a_0 = alpha0 * A[0];
+
+ // main loop: four columns per iteration
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// Non-transposed gemv on a 4-row panel: z[0:4] = alpha*A*x + beta*y over
+// kmax columns of A (lib4 panel-major, panel height 4). k0/k1 select
+// which of the 4 result rows are actually stored (rows i with k0<=i<k1),
+// so the same kernel handles edge panels.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // main loop: accumulate four columns of A per iteration
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ // apply alpha/beta scaling
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ // masked store: bitwise & is deliberate here (branchless style); the
+ // operands are 0/1 comparison results, so & equals logical &&
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
+
+
+
+
+// Full-panel wrapper: z[0:4] = alpha*A*x + beta*y (all 4 rows stored).
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+#endif
+
+
+
+// Variable-size wrapper: store only the first k1 of the 4 result rows.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
+// Transposed gemv over 4 columns: z[0:4] = alpha*A'*x + beta*y, where A
+// spans kmax rows in lib4 panel-major format with panel stride sda.
+// offA (0..3) is the row offset of A inside its first panel; km masks
+// how many of the 4 results are stored (edge handling).
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k, kend;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ // head clean-up: consume rows until aligned to a panel boundary
+ if(offA!=0) // 1, 2, 3
+ {
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1); // jump to the start of the next panel
+ }
+ // main loop: one full 4-row panel per iteration
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ // tail clean-up: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ // apply alpha/beta scaling
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ // masked store: km==1 -> z[0]; km==2 -> z[0..1]; km==3 -> z[0..2]
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+// Full wrapper: z[0:4] = alpha*A'*x + beta*y, aligned panels, all 4 stored.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+// Variable-size wrapper: store only the first k1 of the 4 results.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
+// Lower-triangular, non-transposed triangular solve (variable size):
+// first subtracts A[0:kmax]*x from y (gemv update over the columns left
+// of the diagonal block), then forward-substitutes through the 4x4
+// lower-triangular block at the end of A. inv_diag_A holds the
+// RECIPROCALS of the diagonal entries, so divisions become multiplies.
+// km/kn bound how many rows/columns of the corner block are active.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // gemv update: y -= A*x over the full 4-column blocks before the corner
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // add the right-hand side
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ float
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // forward substitution through the 4x4 corner block; inv_diag_A[i] is
+ // 1/A(i,i). NOTE(review): in the partial-kn branches below the not-yet
+ // solved entries are written back to y (the rhs), not z -- this looks
+ // intentional (chaining of updated rhs) but confirm against the
+ // double-precision counterpart kernel.
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22 (registers a_00/a_10 reused for the third column)
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+// Full-size wrapper: solve all 4 rows/columns of the corner block.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+ }
+#endif
+
+
+
+// Lower-triangular, TRANSPOSED triangular solve (4 unknowns): first
+// subtracts A'[.,4:kmax]*x[4:kmax] from y (rows below the 4x4 diagonal
+// block), then back-substitutes bottom-up through the transposed 4x4
+// block. inv_diag_A holds the reciprocals of the diagonal entries.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // remember the diagonal block and rhs start for the substitution phase
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // skip the 4x4 diagonal block, start the gemv update at row 4
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ // clean-up: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ // add the right-hand side
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ // rewind to the diagonal block for back substitution
+ A = tA;
+ x = tx;
+
+ // bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+// Lower-triangular, TRANSPOSED triangular solve (3 unknowns): like the
+// 4-unknown variant, but only rows/cols 0..2 are solved; row 3 of the
+// first panel is consumed in the head clean-up before the main loop.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // remember the diagonal block and rhs start for the substitution phase
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning: consume row 3 of the first panel
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ // main loop: one full 4-row panel per iteration
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+ x += 1;
+ }
+ // tail clean-up: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ // add the right-hand side
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ // rewind to the diagonal block for back substitution
+ A = tA;
+ x = tx;
+
+ // bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+// Lower-triangular, TRANSPOSED triangular solve (2 unknowns): rows 2-3
+// of the first panel are consumed in the head clean-up, then only
+// rows/cols 0..1 are solved.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // remember the diagonal block and rhs start for the substitution phase
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning: consume rows 2-3 of the first panel
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ // main loop: one full 4-row panel per iteration
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ // tail clean-up: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ // add the right-hand side
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ // rewind to the diagonal block for back substitution
+ A = tA;
+ x = tx;
+
+ // top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+// Lower-triangular, TRANSPOSED triangular solve (1 unknown): rows 1-3 of
+// the first panel are consumed in the head clean-up, then the single
+// unknown is solved with the reciprocal diagonal.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ // remember the diagonal block and rhs start for the substitution phase
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning: consume rows 1-3 of the first panel
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ // main loop: one full 4-row panel per iteration
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ // tail clean-up: remaining rows one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ // add the right-hand side
+ y_0 = y[0] + y_0;
+
+ A = tA;
+ x = tx;
+
+ // solve the single diagonal entry (reciprocal multiply)
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+// Upper-triangular (non-transposed) matrix-vector product:
+// z[0:4] = U * x over kmax columns, U stored lib4 panel-major. The first
+// 4x4 block is peeled: the commented-out products correspond to the
+// strictly-lower entries, which are not referenced for an upper matrix.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // peeled triangular 4x4 block
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ // remaining columns are fully dense: plain gemv accumulation
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ // clean-up loop: remaining columns one at a time
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+// Upper-triangular TRANSPOSED matrix-vector product (variable size):
+// z[0:4] = U' * x over kmax rows; the final 4x4 triangular block is
+// peeled at the end, with the zero entries of U' commented out. km masks
+// how many of the 4 results are stored.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
+ {
+
+ const int bs = 4; // panel height of the lib4 storage format
+
+ int
+ k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ // main loop: full panels strictly above the trailing triangular block
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ // peeled trailing triangular block: commented-out products are the
+ // zero (strictly lower of U') entries
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+ // store_vs
+ // NOTE(review): the `store:` label below is never the target of a goto
+ // (likely leftover from an asm-mirroring template); some compilers warn
+ store:
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+// Full wrapper: z[0:4] = U' * x with all 4 results stored.
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
+ {
+
+ kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgetrf_pivot_4_lib4.c b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..fdec8de
--- /dev/null
+++ b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
@@ -0,0 +1,786 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_s_aux.h"
+
+
+
+// C numbering, starting from 0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Find the index and value of the maximum absolute value (BLAS "iamax") of an
// n-element column stored in panel-major (lib4) format.
//
// n         number of elements to scan
// offset    row offset of pA inside its 4-row panel
// pA        pointer to the first element to scan
// sda       panel stride: the next 4-row panel starts 4*sda floats later
// p_idamax  out: zero-based index of the max-|.| element; -1 when n<1
// p_amax    out: the max absolute value (left untouched when n<1)
void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
	{

	int idamax, ii;
	float tmp, amax;

	p_idamax[0] = -1;
	if(n<1)
		return;

	const int bs = 4;

	// elements left in the (possibly partial) leading panel
	int na = (bs - offset%bs)%bs;
	na = n<na ? n : na;

	// amax starts below any |value|, so idamax is always overwritten for n>=1;
	// it is still initialized to silence uninitialized-use warnings
	amax = -1.0;
	idamax = 0;
	ii = 0;
	if(na>0)
		{
		for( ; ii<na; ii++)
			{
			tmp = fabsf(pA[0]); // fabsf: keep the computation in single precision
			if(tmp>amax)
				{
				idamax = ii+0;
				amax = tmp;
				}
			pA += 1;
			}
		pA += bs*(sda-1); // jump to the next panel
		}
	for( ; ii<n-3; ii+=4) // one full 4-row panel per iteration
		{
		tmp = fabsf(pA[0]);
		if(tmp>amax)
			{
			idamax = ii+0;
			amax = tmp;
			}
		tmp = fabsf(pA[1]);
		if(tmp>amax)
			{
			idamax = ii+1;
			amax = tmp;
			}
		tmp = fabsf(pA[2]);
		if(tmp>amax)
			{
			idamax = ii+2;
			amax = tmp;
			}
		tmp = fabsf(pA[3]);
		if(tmp>amax)
			{
			idamax = ii+3;
			amax = tmp;
			}
		pA += bs*sda;
		}
	for( ; ii<n; ii++) // tail rows of the last partial panel
		{
		tmp = fabsf(pA[0]);
		if(tmp>amax)
			{
			idamax = ii+0;
			amax = tmp;
			}
		pA += 1;
		}

	p_amax[0] = amax;
	p_idamax[0] = idamax;

	return;

	}
+#endif
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// LU factorization with partial (row) pivoting of an m x 4 block stored in
// panel-major (lib4) format: pA is overwritten in place with the unit-lower
// multipliers (below the diagonal) and the U factor (on/above the diagonal).
//
// m           number of rows, assumed >= 4
// pA          first element of the block; the next 4-row panel is at pA + 4*sda
// sda         panel stride, in units of 4 floats
// inv_diag_A  on exit, reciprocals of the 4 diagonal entries of U
//             (set to 0.0 where the pivot is exactly zero)
// ipiv        on exit, the 4 pivot row indices, C numbering (starting at 0)
void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
	{

	const int bs = 4;

	// assume m>=4
	int ma = m-4; // number of rows below the leading 4-row panel

	float
		tmp0, tmp1, tmp2, tmp3,
		u_00, u_01, u_02, u_03, // NOTE: u_00, u_11, u_22, u_33 are declared but unused
		u_11, u_12, u_13,
		u_22, u_23,
		u_33;

	float
		*pB;

	int
		k, idamax;

	// first column

	// find the pivot (largest |.|) over the whole column, swap it to row 0,
	// then scale the sub-diagonal entries by the pivot reciprocal
	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
	ipiv[0] = idamax;
	if(tmp0!=0.0)
		{
		if(ipiv[0]!=0)
			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // swap rows 0 and ipiv[0]

		tmp0 = 1.0 / pA[0+bs*0];
		inv_diag_A[0] = tmp0;
		pA[1+bs*0] *= tmp0;
		pA[2+bs*0] *= tmp0;
		pA[3+bs*0] *= tmp0;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4) // full 4-row panels below the leading panel
			{
			pB[0+bs*0] *= tmp0;
			pB[1+bs*0] *= tmp0;
			pB[2+bs*0] *= tmp0;
			pB[3+bs*0] *= tmp0;
			pB += bs*sda;
			}
		for( ; k<ma; k++) // leftover rows of the last partial panel
			{
			pB[0+bs*0] *= tmp0;
			pB += 1;
			}
		}
	else
		{
		// exactly zero pivot: singular column, flag it for the caller
		inv_diag_A[0] = 0.0;
		}

	// second column

	// apply the rank-1 update from column 0 to column 1
	u_01 = pA[0+bs*1];
	tmp1 = pA[1+bs*1];
	tmp2 = pA[2+bs*1];
	tmp3 = pA[3+bs*1];
	tmp1 -= pA[1+bs*0] * u_01;
	tmp2 -= pA[2+bs*0] * u_01;
	tmp3 -= pA[3+bs*0] * u_01;
	pA[1+bs*1] = tmp1;
	pA[2+bs*1] = tmp2;
	pA[3+bs*1] = tmp3;
	pB = pA + bs*sda;
	for(k=0; k<ma-3; k+=4)
		{
		tmp0 = pB[0+bs*1];
		tmp1 = pB[1+bs*1];
		tmp2 = pB[2+bs*1];
		tmp3 = pB[3+bs*1];
		tmp0 -= pB[0+bs*0] * u_01;
		tmp1 -= pB[1+bs*0] * u_01;
		tmp2 -= pB[2+bs*0] * u_01;
		tmp3 -= pB[3+bs*0] * u_01;
		pB[0+bs*1] = tmp0;
		pB[1+bs*1] = tmp1;
		pB[2+bs*1] = tmp2;
		pB[3+bs*1] = tmp3;
		pB += bs*sda;
		}
	for( ; k<ma; k++)
		{
		tmp0 = pB[0+bs*1];
		tmp0 -= pB[0+bs*0] * u_01;
		pB[0+bs*1] = tmp0;
		pB += 1;
		}

	// find pivot over rows 1..m-1 and scale the sub-diagonal entries
	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
	ipiv[1] = idamax+1;
	if(tmp1!=0)
		{
		if(ipiv[1]!=1)
			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);

		tmp1 = 1.0 / pA[1+bs*1];
		inv_diag_A[1] = tmp1;
		pA[2+bs*1] *= tmp1;
		pA[3+bs*1] *= tmp1;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			pB[0+bs*1] *= tmp1;
			pB[1+bs*1] *= tmp1;
			pB[2+bs*1] *= tmp1;
			pB[3+bs*1] *= tmp1;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			pB[0+bs*1] *= tmp1;
			pB += 1;
			}
		}
	else
		{
		inv_diag_A[1] = 0.0;
		}

	// third column

	// apply the rank-2 update from columns 0 and 1 to column 2
	u_02 = pA[0+bs*2];
	u_12 = pA[1+bs*2];
	u_12 -= pA[1+bs*0] * u_02;
	pA[1+bs*2] = u_12;
	tmp2 = pA[2+bs*2];
	tmp3 = pA[3+bs*2];
	tmp2 -= pA[2+bs*0] * u_02;
	tmp3 -= pA[3+bs*0] * u_02;
	tmp2 -= pA[2+bs*1] * u_12;
	tmp3 -= pA[3+bs*1] * u_12;
	pA[2+bs*2] = tmp2;
	pA[3+bs*2] = tmp3;
	pB = pA + bs*sda;
	for(k=0; k<ma-3; k+=4)
		{
		tmp0 = pB[0+bs*2];
		tmp1 = pB[1+bs*2];
		tmp2 = pB[2+bs*2];
		tmp3 = pB[3+bs*2];
		tmp0 -= pB[0+bs*0] * u_02;
		tmp1 -= pB[1+bs*0] * u_02;
		tmp2 -= pB[2+bs*0] * u_02;
		tmp3 -= pB[3+bs*0] * u_02;
		tmp0 -= pB[0+bs*1] * u_12;
		tmp1 -= pB[1+bs*1] * u_12;
		tmp2 -= pB[2+bs*1] * u_12;
		tmp3 -= pB[3+bs*1] * u_12;
		pB[0+bs*2] = tmp0;
		pB[1+bs*2] = tmp1;
		pB[2+bs*2] = tmp2;
		pB[3+bs*2] = tmp3;
		pB += bs*sda;
		}
	for( ; k<ma; k++)
		{
		tmp0 = pB[0+bs*2];
		tmp0 -= pB[0+bs*0] * u_02;
		tmp0 -= pB[0+bs*1] * u_12;
		pB[0+bs*2] = tmp0;
		pB += 1;
		}

	// find pivot over rows 2..m-1 and scale
	sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
	ipiv[2] = idamax+2;
	if(tmp2!=0)
		{
		if(ipiv[2]!=2)
			srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);

		tmp2 = 1.0 / pA[2+bs*2];
		inv_diag_A[2] = tmp2;
		pA[3+bs*2] *= tmp2;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			pB[0+bs*2] *= tmp2;
			pB[1+bs*2] *= tmp2;
			pB[2+bs*2] *= tmp2;
			pB[3+bs*2] *= tmp2;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			pB[0+bs*2] *= tmp2;
			pB += 1;
			}
		}
	else
		{
		inv_diag_A[2] = 0.0;
		}

	// fourth column

	// apply the rank-3 update from columns 0, 1 and 2 to column 3
	u_03 = pA[0+bs*3];
	u_13 = pA[1+bs*3];
	u_13 -= pA[1+bs*0] * u_03;
	pA[1+bs*3] = u_13;
	u_23 = pA[2+bs*3];
	u_23 -= pA[2+bs*0] * u_03;
	u_23 -= pA[2+bs*1] * u_13;
	pA[2+bs*3] = u_23;
	tmp3 = pA[3+bs*3];
	tmp3 -= pA[3+bs*0] * u_03;
	tmp3 -= pA[3+bs*1] * u_13;
	tmp3 -= pA[3+bs*2] * u_23;
	pA[3+bs*3] = tmp3;
	pB = pA + bs*sda;
	for(k=0; k<ma-3; k+=4)
		{
		tmp0 = pB[0+bs*3];
		tmp1 = pB[1+bs*3];
		tmp2 = pB[2+bs*3];
		tmp3 = pB[3+bs*3];
		tmp0 -= pB[0+bs*0] * u_03;
		tmp1 -= pB[1+bs*0] * u_03;
		tmp2 -= pB[2+bs*0] * u_03;
		tmp3 -= pB[3+bs*0] * u_03;
		tmp0 -= pB[0+bs*1] * u_13;
		tmp1 -= pB[1+bs*1] * u_13;
		tmp2 -= pB[2+bs*1] * u_13;
		tmp3 -= pB[3+bs*1] * u_13;
		tmp0 -= pB[0+bs*2] * u_23;
		tmp1 -= pB[1+bs*2] * u_23;
		tmp2 -= pB[2+bs*2] * u_23;
		tmp3 -= pB[3+bs*2] * u_23;
		pB[0+bs*3] = tmp0;
		pB[1+bs*3] = tmp1;
		pB[2+bs*3] = tmp2;
		pB[3+bs*3] = tmp3;
		pB += bs*sda;
		}
	for( ; k<ma; k++)
		{
		tmp0 = pB[0+bs*3];
		tmp0 -= pB[0+bs*0] * u_03;
		tmp0 -= pB[0+bs*1] * u_13;
		tmp0 -= pB[0+bs*2] * u_23;
		pB[0+bs*3] = tmp0;
		pB += 1;
		}

	// find pivot over rows 3..m-1 and scale
	sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
	ipiv[3] = idamax+3;
	if(tmp3!=0)
		{
		if(ipiv[3]!=3)
			srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);

		tmp3 = 1.0 / pA[3+bs*3];
		inv_diag_A[3] = tmp3;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			pB[0+bs*3] *= tmp3;
			pB[1+bs*3] *= tmp3;
			pB[2+bs*3] *= tmp3;
			pB[3+bs*3] *= tmp3;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			pB[0+bs*3] *= tmp3;
			pB += 1;
			}
		}
	else
		{
		inv_diag_A[3] = 0.0;
		}

	return;

	}
+#endif
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size LU factorization with partial (row) pivoting of an m x n
// block (m>0, 0<n<=4) stored in panel-major (lib4) format; pA is overwritten
// in place with the unit-lower multipliers and the U factor.
//
// m           number of rows (any m>0; small-m cases are handled explicitly)
// n           number of columns to factorize (1..4)
// pA          first element of the block; the next 4-row panel is at pA + 4*sda
// sda         panel stride, in units of 4 floats
// inv_diag_A  on exit, reciprocals of the computed diagonal entries of U
//             (0.0 where the pivot is exactly zero)
// ipiv        on exit, pivot row indices, C numbering (starting at 0)
//
// NOTE(review): entries of ipiv / inv_diag_A beyond the factorized columns
// (e.g. index 2 when m==2, index 3 when m==3) are left unwritten — callers
// must not read them.
void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
	{

	if(m<=0 || n<=0)
		return;

	const int bs = 4;

	// ma may be negative when m<4; every loop over it below is guarded by an
	// m>=4 check, so it is only used when it is non-negative
	int ma = m-4;

	float
		tmp0, tmp1, tmp2, tmp3,
		u_00, u_01, u_02, u_03, // NOTE: u_00, u_11, u_22, u_33 are declared but unused
		u_11, u_12, u_13,
		u_22, u_23,
		u_33;

	float
		*pB;

	int
		k, idamax;

	// first column

	// find pivot & scale
	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
	ipiv[0] = idamax;
	if(tmp0!=0.0)
		{
		if(ipiv[0]!=0)
			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // swap rows 0 and ipiv[0]

		tmp0 = 1.0 / pA[0+bs*0];
		inv_diag_A[0] = tmp0;
		if(m>=4)
			{
			pA[1+bs*0] *= tmp0;
			pA[2+bs*0] *= tmp0;
			pA[3+bs*0] *= tmp0;
			pB = pA + bs*sda;
			for(k=0; k<ma-3; k+=4) // full panels below the leading one
				{
				pB[0+bs*0] *= tmp0;
				pB[1+bs*0] *= tmp0;
				pB[2+bs*0] *= tmp0;
				pB[3+bs*0] *= tmp0;
				pB += bs*sda;
				}
			for( ; k<ma; k++) // leftover rows of the last partial panel
				{
				pB[0+bs*0] *= tmp0;
				pB += 1;
				}
			}
		else // m = {1,2,3}
			{
			if(m>1)
				{
				pA[1+bs*0] *= tmp0;
				if(m>2)
					pA[2+bs*0] *= tmp0;
				}
			}
		}
	else
		{
		// exactly zero pivot: singular column, flag it for the caller
		inv_diag_A[0] = 0.0;
		}

	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
		return;

	// second column

	// correct
	if(m>=4)
		{
		u_01 = pA[0+bs*1];
		tmp1 = pA[1+bs*1];
		tmp2 = pA[2+bs*1];
		tmp3 = pA[3+bs*1];
		tmp1 -= pA[1+bs*0] * u_01;
		tmp2 -= pA[2+bs*0] * u_01;
		tmp3 -= pA[3+bs*0] * u_01;
		pA[1+bs*1] = tmp1;
		pA[2+bs*1] = tmp2;
		pA[3+bs*1] = tmp3;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			tmp0 = pB[0+bs*1];
			tmp1 = pB[1+bs*1];
			tmp2 = pB[2+bs*1];
			tmp3 = pB[3+bs*1];
			tmp0 -= pB[0+bs*0] * u_01;
			tmp1 -= pB[1+bs*0] * u_01;
			tmp2 -= pB[2+bs*0] * u_01;
			tmp3 -= pB[3+bs*0] * u_01;
			pB[0+bs*1] = tmp0;
			pB[1+bs*1] = tmp1;
			pB[2+bs*1] = tmp2;
			pB[3+bs*1] = tmp3;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			tmp0 = pB[0+bs*1];
			tmp0 -= pB[0+bs*0] * u_01;
			pB[0+bs*1] = tmp0;
			pB += 1;
			}
		}
	else // m = {2,3}
		{
		u_01 = pA[0+bs*1];
		tmp1 = pA[1+bs*1];
		tmp1 -= pA[1+bs*0] * u_01;
		pA[1+bs*1] = tmp1;
		if(m>2)
			{
			tmp2 = pA[2+bs*1];
			tmp2 -= pA[2+bs*0] * u_01;
			pA[2+bs*1] = tmp2;
			}
		}

	// find pivot & scale
	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
	ipiv[1] = idamax+1;
	if(tmp1!=0)
		{
		if(ipiv[1]!=1)
			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);

		tmp1 = 1.0 / pA[1+bs*1];
		inv_diag_A[1] = tmp1;
		if(m>=4)
			{
			pA[2+bs*1] *= tmp1;
			pA[3+bs*1] *= tmp1;
			pB = pA + bs*sda;
			for(k=0; k<ma-3; k+=4)
				{
				pB[0+bs*1] *= tmp1;
				pB[1+bs*1] *= tmp1;
				pB[2+bs*1] *= tmp1;
				pB[3+bs*1] *= tmp1;
				pB += bs*sda;
				}
			for( ; k<ma; k++)
				{
				pB[0+bs*1] *= tmp1;
				pB += 1;
				}
			}
		else // m = {2,3}
			{
			if(m>2)
				pA[2+bs*1] *= tmp1;
			}
		}
	else
		{
		inv_diag_A[1] = 0.0;
		}

	if(n==2)
		return;

	// third column

	// correct
	if(m>=4)
		{
		u_02 = pA[0+bs*2];
		u_12 = pA[1+bs*2];
		u_12 -= pA[1+bs*0] * u_02;
		pA[1+bs*2] = u_12;
		tmp2 = pA[2+bs*2];
		tmp3 = pA[3+bs*2];
		tmp2 -= pA[2+bs*0] * u_02;
		tmp3 -= pA[3+bs*0] * u_02;
		tmp2 -= pA[2+bs*1] * u_12;
		tmp3 -= pA[3+bs*1] * u_12;
		pA[2+bs*2] = tmp2;
		pA[3+bs*2] = tmp3;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			tmp0 = pB[0+bs*2];
			tmp1 = pB[1+bs*2];
			tmp2 = pB[2+bs*2];
			tmp3 = pB[3+bs*2];
			tmp0 -= pB[0+bs*0] * u_02;
			tmp1 -= pB[1+bs*0] * u_02;
			tmp2 -= pB[2+bs*0] * u_02;
			tmp3 -= pB[3+bs*0] * u_02;
			tmp0 -= pB[0+bs*1] * u_12;
			tmp1 -= pB[1+bs*1] * u_12;
			tmp2 -= pB[2+bs*1] * u_12;
			tmp3 -= pB[3+bs*1] * u_12;
			pB[0+bs*2] = tmp0;
			pB[1+bs*2] = tmp1;
			pB[2+bs*2] = tmp2;
			pB[3+bs*2] = tmp3;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			tmp0 = pB[0+bs*2];
			tmp0 -= pB[0+bs*0] * u_02;
			tmp0 -= pB[0+bs*1] * u_12;
			pB[0+bs*2] = tmp0;
			pB += 1;
			}
		}
	else // m = {2,3}
		{
		u_02 = pA[0+bs*2];
		u_12 = pA[1+bs*2];
		u_12 -= pA[1+bs*0] * u_02;
		pA[1+bs*2] = u_12;
		if(m>2)
			{
			tmp2 = pA[2+bs*2];
			tmp2 -= pA[2+bs*0] * u_02;
			tmp2 -= pA[2+bs*1] * u_12;
			pA[2+bs*2] = tmp2;
			}
		}

	// find pivot & scale (only when a third row exists)
	if(m>2)
		{
		sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
		ipiv[2] = idamax+2;
		if(tmp2!=0)
			{
			if(ipiv[2]!=2)
				srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);

			tmp2 = 1.0 / pA[2+bs*2];
			inv_diag_A[2] = tmp2;
			if(m>=4)
				{
				pA[3+bs*2] *= tmp2;
				pB = pA + bs*sda;
				for(k=0; k<ma-3; k+=4)
					{
					pB[0+bs*2] *= tmp2;
					pB[1+bs*2] *= tmp2;
					pB[2+bs*2] *= tmp2;
					pB[3+bs*2] *= tmp2;
					pB += bs*sda;
					}
				for( ; k<ma; k++)
					{
					pB[0+bs*2] *= tmp2;
					pB += 1;
					}
				}
			}
		else
			{
			inv_diag_A[2] = 0.0;
			}
		}

	if(n<4)
		return;

	// fourth column

	// correct
	if(m>=4)
		{
		u_03 = pA[0+bs*3];
		u_13 = pA[1+bs*3];
		u_13 -= pA[1+bs*0] * u_03;
		pA[1+bs*3] = u_13;
		u_23 = pA[2+bs*3];
		u_23 -= pA[2+bs*0] * u_03;
		u_23 -= pA[2+bs*1] * u_13;
		pA[2+bs*3] = u_23;
		tmp3 = pA[3+bs*3];
		tmp3 -= pA[3+bs*0] * u_03;
		tmp3 -= pA[3+bs*1] * u_13;
		tmp3 -= pA[3+bs*2] * u_23;
		pA[3+bs*3] = tmp3;
		pB = pA + bs*sda;
		for(k=0; k<ma-3; k+=4)
			{
			tmp0 = pB[0+bs*3];
			tmp1 = pB[1+bs*3];
			tmp2 = pB[2+bs*3];
			tmp3 = pB[3+bs*3];
			tmp0 -= pB[0+bs*0] * u_03;
			tmp1 -= pB[1+bs*0] * u_03;
			tmp2 -= pB[2+bs*0] * u_03;
			tmp3 -= pB[3+bs*0] * u_03;
			tmp0 -= pB[0+bs*1] * u_13;
			tmp1 -= pB[1+bs*1] * u_13;
			tmp2 -= pB[2+bs*1] * u_13;
			tmp3 -= pB[3+bs*1] * u_13;
			tmp0 -= pB[0+bs*2] * u_23;
			tmp1 -= pB[1+bs*2] * u_23;
			tmp2 -= pB[2+bs*2] * u_23;
			tmp3 -= pB[3+bs*2] * u_23;
			pB[0+bs*3] = tmp0;
			pB[1+bs*3] = tmp1;
			pB[2+bs*3] = tmp2;
			pB[3+bs*3] = tmp3;
			pB += bs*sda;
			}
		for( ; k<ma; k++)
			{
			tmp0 = pB[0+bs*3];
			tmp0 -= pB[0+bs*0] * u_03;
			tmp0 -= pB[0+bs*1] * u_13;
			tmp0 -= pB[0+bs*2] * u_23;
			pB[0+bs*3] = tmp0;
			pB += 1;
			}
		}
	else // m = {2,3}
		{
		u_03 = pA[0+bs*3];
		u_13 = pA[1+bs*3];
		u_13 -= pA[1+bs*0] * u_03;
		pA[1+bs*3] = u_13;
		if(m>2)
			{
			u_23 = pA[2+bs*3];
			u_23 -= pA[2+bs*0] * u_03;
			u_23 -= pA[2+bs*1] * u_13;
			pA[2+bs*3] = u_23;
			}
		}

	if(m>3)
		{
		// find pivot & scale (only when a fourth row exists)
		sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
		ipiv[3] = idamax+3;
		if(tmp3!=0)
			{
			if(ipiv[3]!=3)
				srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);

			tmp3 = 1.0 / pA[3+bs*3];
			inv_diag_A[3] = tmp3;
			pB = pA + bs*sda;
			for(k=0; k<ma-3; k+=4)
				{
				pB[0+bs*3] *= tmp3;
				pB[1+bs*3] *= tmp3;
				pB[2+bs*3] *= tmp3;
				pB[3+bs*3] *= tmp3;
				pB += bs*sda;
				}
			for( ; k<ma; k++)
				{
				pB[0+bs*3] *= tmp3;
				pB += 1;
				}
			}
		else
			{
			inv_diag_A[3] = 0.0;
			}
		}

	return;

	}
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Fused matrix-vector kernel on a kmax x 4 panel-major (lib4) slice of A:
//   z_n[0..kmax-1] += A * (alpha_n * x_n[0..3])            (normal part)
//   z_t[0..km-1]    = alpha_t * A^T * x_t + beta_t * y_t   (transposed part)
// km (1..4) limits how many x_n entries are read and how many z_t entries
// are stored. z_n must already hold the scaled y_n (see XXX note above).
void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, ii;

	float
		a_0, a_1, a_2, a_3,
		xn_0, xn_1, xn_2, xn_3,
		xt_i, yn_i,
		yt_0, yt_1, yt_2, yt_3;

	// scaled copy of x_n, zero-padded past km
	xn_0 = alpha_n[0]*x_n[0];
	xn_1 = km>1 ? alpha_n[0]*x_n[1] : 0.0f;
	xn_2 = km>2 ? alpha_n[0]*x_n[2] : 0.0f;
	xn_3 = km>3 ? alpha_n[0]*x_n[3] : 0.0f;

	yt_0 = 0.0f;
	yt_1 = 0.0f;
	yt_2 = 0.0f;
	yt_3 = 0.0f;

	k = 0;
	for(; k<kmax-3; k+=bs)
		{
		// one full 4-row panel, row by row
		for(ii=0; ii<4; ii++)
			{
			yn_i = z_n[ii];
			xt_i = x_t[ii];

			a_0 = A[ii+bs*0];
			a_1 = A[ii+bs*1];
			a_2 = A[ii+bs*2];
			a_3 = A[ii+bs*3];

			yn_i += a_0 * xn_0;
			yt_0 += a_0 * xt_i;
			yn_i += a_1 * xn_1;
			yt_1 += a_1 * xt_i;
			yn_i += a_2 * xn_2;
			yt_2 += a_2 * xt_i;
			yn_i += a_3 * xn_3;
			yt_3 += a_3 * xt_i;

			z_n[ii] = yn_i;
			}

		A += sda*bs;
		z_n += 4;
		x_t += 4;
		}
	for(; k<kmax; k++)
		{
		// tail rows of the last partial panel
		yn_i = z_n[0];
		xt_i = x_t[0];

		a_0 = A[0+bs*0];
		a_1 = A[0+bs*1];
		a_2 = A[0+bs*2];
		a_3 = A[0+bs*3];

		yn_i += a_0 * xn_0;
		yt_0 += a_0 * xt_i;
		yn_i += a_1 * xn_1;
		yt_1 += a_1 * xt_i;
		yn_i += a_2 * xn_2;
		yt_2 += a_2 * xt_i;
		yn_i += a_3 * xn_3;
		yt_3 += a_3 * xt_i;

		z_n[0] = yn_i;

		A += 1;
		z_n += 1;
		x_t += 1;
		}

	// store the transposed result, scaled and blended with y_t
	z_t[0] = alpha_t[0]*yt_0 + beta_t[0]*y_t[0];
	if(km>1)
		{
		z_t[1] = alpha_t[0]*yt_1 + beta_t[0]*y_t[1];
		if(km>2)
			{
			z_t[2] = alpha_t[0]*yt_2 + beta_t[0]*y_t[2];
			if(km>3)
				{
				z_t[3] = alpha_t[0]*yt_3 + beta_t[0]*y_t[3];
				}
			}
		}

	return;

	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Fused gemv_n / gemv_t kernel, full-width variant: thin forwarding wrapper
// around the variable-size kernel with the store count km fixed to 4.
void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
	{
	kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
	return;
	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Fused symv kernel on the lower-triangular 4-column slice of a symmetric
// matrix stored in panel-major (lib4) format:
//   z_n[k] += alpha * sum_{j<min(k,4)} A(k,j) * x_n[j]    (strictly-lower part)
//   z_n[j] += alpha * sum_{k>=j}       A(k,j) * x_n[k]    (transposed part, j<km)
// x and z alias their n/t roles (x_t = x_n, z_t = z_n). offA is the row
// offset of A inside its 4-row panel; km (1..4) limits how many x_n entries
// are read and how many transposed results are accumulated.
// z_n must already hold the scaled y_n (see XXX note above).
void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
	{

	if(kmax<=0)
		return;

	float *x_t = x_n;
	float *z_t = z_n;

	const int bs = 4;

	int k;

	float
		a_0, a_1, a_2, a_3,
		xn_0, xn_1, xn_2, xn_3,
		xt_k, yn_k,
		yt_0, yt_1, yt_2, yt_3;

	// scaled copy of x_n, zero-padded past km
	xn_0 = alpha[0]*x_n[0];
	xn_1 = km>1 ? alpha[0]*x_n[1] : 0.0f;
	xn_2 = km>2 ? alpha[0]*x_n[2] : 0.0f;
	xn_3 = km>3 ? alpha[0]*x_n[3] : 0.0f;

	yt_0 = 0.0f;
	yt_1 = 0.0f;
	yt_2 = 0.0f;
	yt_3 = 0.0f;

	// process rows one at a time; the first 4 rows form the triangle, where
	// column j feeds the normal part only for rows strictly below j, and the
	// transposed part from the diagonal down
	for(k=0; k<kmax; k++)
		{
		xt_k = x_t[0];
		if(k==0)
			{
			// diagonal element of column 0: transposed part only
			a_0 = A[0+bs*0];
			yt_0 += a_0 * xt_k;
			}
		else if(k==1)
			{
			yn_k = z_n[0];
			a_0 = A[0+bs*0];
			a_1 = A[0+bs*1];
			yn_k += a_0 * xn_0;
			yt_0 += a_0 * xt_k;
			yt_1 += a_1 * xt_k;
			z_n[0] = yn_k;
			}
		else if(k==2)
			{
			yn_k = z_n[0];
			a_0 = A[0+bs*0];
			a_1 = A[0+bs*1];
			a_2 = A[0+bs*2];
			yn_k += a_0 * xn_0;
			yt_0 += a_0 * xt_k;
			yn_k += a_1 * xn_1;
			yt_1 += a_1 * xt_k;
			yt_2 += a_2 * xt_k;
			z_n[0] = yn_k;
			}
		else if(k==3)
			{
			yn_k = z_n[0];
			a_0 = A[0+bs*0];
			a_1 = A[0+bs*1];
			a_2 = A[0+bs*2];
			a_3 = A[0+bs*3];
			yn_k += a_0 * xn_0;
			yt_0 += a_0 * xt_k;
			yn_k += a_1 * xn_1;
			yt_1 += a_1 * xt_k;
			yn_k += a_2 * xn_2;
			yt_2 += a_2 * xt_k;
			yt_3 += a_3 * xt_k;
			z_n[0] = yn_k;
			}
		else
			{
			// full row below the triangle: all 4 columns feed both parts
			yn_k = z_n[0];
			a_0 = A[0+bs*0];
			a_1 = A[0+bs*1];
			a_2 = A[0+bs*2];
			a_3 = A[0+bs*3];
			yn_k += a_0 * xn_0;
			yt_0 += a_0 * xt_k;
			yn_k += a_1 * xn_1;
			yt_1 += a_1 * xt_k;
			yn_k += a_2 * xn_2;
			yt_2 += a_2 * xt_k;
			yn_k += a_3 * xn_3;
			yt_3 += a_3 * xt_k;
			z_n[0] = yn_k;
			}
		A += 1;
		z_n += 1;
		x_t += 1;
		if((offA+k+1)%bs==0)
			A += (sda-1)*bs; // crossed a panel boundary: jump to the next panel
		}

	// accumulate the transposed result into the first km entries of z_t
	z_t[0] += alpha[0]*yt_0;
	if(km>1)
		{
		z_t[1] += alpha[0]*yt_1;
		if(km>2)
			{
			z_t[2] += alpha[0]*yt_2;
			if(km>3)
				{
				z_t[3] += alpha[0]*yt_3;
				}
			}
		}

	return;

	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Lower-symmetric matrix-vector kernel, aligned full-width variant: thin
// forwarding wrapper around the generic kernel with offA=0 and km fixed to 4.
void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
	{
	kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
	return;
	}
+#endif
+
+
+
+
+