Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/c99/Makefile b/kernel/c99/Makefile
new file mode 100644
index 0000000..55d54ef
--- /dev/null
+++ b/kernel/c99/Makefile
@@ -0,0 +1,80 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemv_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemv_4_lib4.o
+#OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+OBJS +=
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
+void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	k = 0;
+	if(offsetB!=0)
+		{
+		if(offsetB==1)
+			{
+
+			B += 1;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else if(offsetB==2)
+			{
+
+			B += 2;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else // if(offsetB==3)
+			{
+
+			B += 3;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		}
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}	
+	
+	scale:
+
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 "gemm nn" tile, aligned C/D: thin wrapper over the _gen kernel with
+// all row/column offsets zero and the write window fixed to [0,4)x[0,4).
+void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower-triangular 4x4 "syrk nt" kernel, generalized edges.
+// Accumulates only the lower triangle c_ij (i>=j) of the product of the 4xkmax
+// panel A with the 4xkmax panel B (c_ij += a_i*b_j summed over k), scales it as
+// beta*C + alpha*acc, and stores it shifted by offsetC/offsetD rows inside the
+// 4-row panels of C/D (C1/D1 point at the following panel when the tile
+// straddles a panel boundary; sdc/sdd are the panel strides). m0/m1 and n0/n1
+// clip the rows/columns actually written.
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4; // panel height (block size)
+
+	// accumulators: lower triangle only
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	// main k loop, unrolled by 4: A and B both advance one 4x4 sub-panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// cleanup k loop: one column of A/B at a time
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// scale: c_ij = beta*C[i+offsetC, j] + alpha*c_ij, reading C starting
+	// offsetC rows into the panel; C1 is the next 4-row panel of C when the
+	// 4-row tile straddles a panel boundary
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	// (the n0 leading columns are clipped: slide the surviving columns of the
+	// lower triangle left and advance D0 past the skipped columns)
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	// number of columns left to store
+	int kn = n1 - n0;
+
+	// store the lower triangle with row offset offsetD; m0/m1 clip rows and
+	// kn clips columns ('&' is a deliberate non-short-circuit AND on int flags)
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Lower-triangular 4x4 "syrk nt" kernel, variable-size store, aligned C/D.
+// Accumulates only the lower triangle c_ij (i>=j) of the product of the 4xkmax
+// panel A with the 4xkmax panel B (c_ij += a_i*b_j summed over k), scales as
+// beta*C + alpha*acc, and writes the first km rows / kn columns of the tile.
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4; // panel height (block size)
+
+	// accumulators: lower triangle only
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// main k loop, unrolled by 4: A and B both advance one 4x4 sub-panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// cleanup k loop: one column of A/B at a time
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// scale the lower triangle: c_ij = beta*C[i,j] + alpha*c_ij
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	// store the lower triangle clipped to km rows and kn columns
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 lower-triangular "syrk nt" tile: thin wrapper over the _vs kernel
+// with the row/column clip fixed to the whole 4x4 tile.
+void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// 4x4 "trmm nt ru" kernel, variable-size store: D = beta*C + alpha * A * B',
+// where the 4xkmax panel B holds an upper-triangular factor, so the peeled
+// iterations k=0..2 load only b_0..b_k (the remaining entries of those B
+// columns are structurally outside the triangle and must not be read).
+// km/kn clip the rows/columns of the 4x4 tile written to D.
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4; // panel height (block size)
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	k = 0;
+
+	// k = 0 : only column 0 of the triangular factor contributes
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 1 : columns 0..1 contribute
+	// BUGFIX: guard was kmax>0, which for kmax==1 read A/B past the panel end
+	// and accumulated spurious products
+	if(kmax>1)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 2 : columns 0..2 contribute
+	// BUGFIX: guard was kmax>0 (see above), wrong for kmax==2
+	if(kmax>2)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// main k loop, unrolled by 4: past the triangle head, full 4x4 gemm updates
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// cleanup k loop: one column of A/B at a time
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// scale: c_ij = beta*C[i,j] + alpha*c_ij
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	// store the tile clipped to km rows and kn columns
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 "trmm nt ru" tile: thin wrapper over the _vs kernel with the
+// row/column clip fixed to the whole 4x4 tile.
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double *D1;
+	
+	int k;
+
+	B += offsetB;
+
+	k = 0;
+
+	if(offsetB==0)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==1)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==2)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 5
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else // if(offetB==3)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	store:
+	
+	c_00 = alpha[0]*c_00;
+	c_10 = alpha[0]*c_10;
+	c_20 = alpha[0]*c_20;
+	c_30 = alpha[0]*c_30;
+
+	c_01 = alpha[0]*c_01;
+	c_11 = alpha[0]*c_11;
+	c_21 = alpha[0]*c_21;
+	c_31 = alpha[0]*c_31;
+
+	c_02 = alpha[0]*c_02;
+	c_12 = alpha[0]*c_12;
+	c_22 = alpha[0]*c_22;
+	c_32 = alpha[0]*c_32;
+
+	c_03 = alpha[0]*c_03;
+	c_13 = alpha[0]*c_13;
+	c_23 = alpha[0]*c_23;
+	c_33 = alpha[0]*c_33;
+
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)  || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 trmm tile: forward to the generic kernel with zero row/column
+// offsets and the complete 0..4 row (m0..m1) and column (n0..n1) ranges.
+void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
+	{
+	kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Cholesky factorization of one 4x4 block: D = chol(C - A * B^T), lower
+// factor only. A and B are panel-major (lib4) operand panels of length kmax;
+// only the lower triangle of the accumulator is computed (the upper entries
+// are kept as commented-out dead code for symmetry with the gemm kernels).
+// inv_diag_D[j] receives 1.0/D[j+bs*j], or 0.0 when the pivot is not
+// positive (that column of the factor is then zeroed rather than producing
+// NaNs). km/kn clip the rows/columns actually stored to D for border blocks;
+// kn also cuts the factorization short via the goto's below.
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// accumulate the lower triangle of -A*B^T, unrolled by 4
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining kmax%4 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// add C (lower triangle only)
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+//	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+//	c_02 = C[0+bs*2] + c_02;
+//	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+//	c_03 = C[0+bs*3] + c_03;
+//	c_13 = C[1+bs*3] + c_13;
+//	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	// column 0: sqrt the pivot and scale the column by its reciprocal;
+	// a non-positive pivot zeroes both the pivot and its inverse
+	if(c_00>0)
+		{
+		c_00 = sqrt(c_00);
+		tmp = 1.0/c_00;
+		}
+	else
+		{
+		c_00 = 0.0;
+		tmp = 0.0;
+		}
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+	
+	// column 1: rank-1 downdate with column 0, then pivot
+	c_11 -= c_10 * c_10;
+	c_21 -= c_20 * c_10;
+	c_31 -= c_30 * c_10;
+	if(c_11>0)
+		{
+		c_11 = sqrt(c_11);
+		tmp = 1.0/c_11;
+		}
+	else
+		{
+		c_11 = 0.0;
+		tmp = 0.0;
+		}
+	c_21 *= tmp;
+	c_31 *= tmp;
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+	
+	// column 2: downdate with columns 0 and 1, then pivot
+	c_22 -= c_20 * c_20;
+	c_32 -= c_30 * c_20;
+	c_22 -= c_21 * c_21;
+	c_32 -= c_31 * c_21;
+	if(c_22>0)
+		{
+		c_22 = sqrt(c_22);
+		tmp = 1.0/c_22;
+		}
+	else
+		{
+		c_22 = 0.0;
+		tmp = 0.0;
+		}
+	c_32 *= tmp;
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+	
+	// column 3: downdate with columns 0..2, then pivot (nothing left to scale)
+	c_33 -= c_30 * c_30;
+	c_33 -= c_31 * c_31;
+	c_33 -= c_32 * c_32;
+	if(c_33>0)
+		{
+		c_33 = sqrt(c_33);
+		tmp = 1.0/c_33;
+		}
+	else
+		{
+		c_33 = 0.0;
+		tmp = 0.0;
+		}
+	inv_diag_D[3] = tmp;
+
+
+	// store the computed lower triangle, clipped to km rows x kn columns
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 Cholesky tile: forward to the variable-size variant with km=kn=4.
+void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fused syrk + Cholesky on one 4x4 block, clipped to km rows x kn columns:
+// first accumulate the symmetric product of the Ap/Bp panels into D, then
+// factorize D in place while subtracting the Am/Bm panel product.
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+	// unit scalings for the syrk stage: D = 1.0*C + 1.0*Ap*Bp^T
+	double scale_ab = 1.0;
+	double scale_c = 1.0;
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);
+	// factorize in place: D = chol(D - Am*Bm^T)
+	kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-tile variant of the fused syrk + Cholesky kernel (km = kn = 4).
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
+	{
+	// unit scalings for the syrk stage: D = 1.0*C + 1.0*Ap*Bp^T
+	double scale_ab = 1.0;
+	double scale_c = 1.0;
+	kernel_dsyrk_nt_l_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);
+	// factorize in place: D = chol(D - Am*Bm^T)
+	kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Triangular solve on one 4x4 block: D = (C - A * B^T) * E^{-T}, with E
+// lower triangular ("rl") and the reciprocals of its diagonal supplied in
+// inv_diag_E ("inv": no divisions are performed here). A and B are
+// panel-major (lib4) operand panels of length kmax. km/kn clip the
+// rows/columns stored to D for border blocks; kn also cuts the substitution
+// short via the goto's below.
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// accumulate -A*B^T, unrolled by 4
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining kmax%4 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// add C
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	// forward substitution, one column at a time:
+	// column j is downdated by the already-solved columns i<j using E[j+bs*i],
+	// then scaled by 1/E[j+bs*j] taken from inv_diag_E
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+	tmp = inv_diag_E[1];
+	c_01 *= tmp;
+	c_11 *= tmp;
+	c_21 *= tmp;
+	c_31 *= tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+	tmp = inv_diag_E[2];
+	c_02 *= tmp;
+	c_12 *= tmp;
+	c_22 *= tmp;
+	c_32 *= tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+	tmp = inv_diag_E[3];
+	c_03 *= tmp;
+	c_13 *= tmp;
+	c_23 *= tmp;
+	c_33 *= tmp;
+
+
+	// store, clipped to km rows x kn columns
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 rl-inv trsm tile: forward to the variable-size variant with km=kn=4.
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fused gemm + rl-inv trsm on one 4x4 block, clipped to km rows x kn columns:
+// first accumulate the Ap/Bp panel product into D, then run the triangular
+// solve on D in place while subtracting the Am/Bm panel product.
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+	// unit scalings for the gemm stage: D = 1.0*C + 1.0*Ap*Bp^T
+	double scale_ab = 1.0;
+	double scale_c = 1.0;
+	kernel_dgemm_nt_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);
+	// solve in place: D = (D - Am*Bm^T) * E^{-T}
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-tile variant of the fused gemm + rl-inv trsm kernel (km = kn = 4).
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	// unit scalings for the gemm stage: D = 1.0*C + 1.0*Ap*Bp^T
+	double scale_ab = 1.0;
+	double scale_c = 1.0;
+	kernel_dgemm_nt_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);
+	// solve in place: D = (D - Am*Bm^T) * E^{-T}
+	kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Triangular solve on one 4x4 block: D = (C - A * B^T) * E^{-T}, with E
+// lower triangular with unit diagonal ("one"): columns are only combined,
+// never scaled, so no inverse-diagonal vector is needed. A and B are
+// panel-major (lib4) operand panels of length kmax. km/kn clip the
+// rows/columns stored to D; kn also cuts the substitution short.
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// accumulate -A*B^T, unrolled by 4
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining kmax%4 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// add C
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	// forward substitution: column j is downdated by the already-solved
+	// columns i<j using E[j+bs*i]; unit diagonal, so no scaling step
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+
+
+	// store, clipped to km rows x kn columns
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full 4x4 rl-one trsm tile: forward to the variable-size variant with km=kn=4.
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Triangular solve on one 4x4 block: D = (C - A * B^T) * E^{-T}, with E
+// upper triangular ("ru") and the reciprocals of its diagonal supplied in
+// inv_diag_E. Back substitution runs from column 3 down to column 0,
+// guarded by if(kn>...) blocks instead of the goto chain used in the rl
+// kernels. km/kn clip the rows/columns stored to D for border blocks.
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// accumulate -A*B^T, unrolled by 4
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining kmax%4 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// add C
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+
+	// back substitution: column j is scaled by 1/E[j+bs*j] (from inv_diag_E),
+	// then subtracted from the not-yet-solved columns i<j using E[i+bs*j]
+	if(kn>3)
+		{
+		tmp = inv_diag_E[3];
+		c_03 *= tmp;
+		c_13 *= tmp;
+		c_23 *= tmp;
+		c_33 *= tmp;
+		tmp = E[2+bs*3];
+		c_02 -= c_03 * tmp;
+		c_12 -= c_13 * tmp;
+		c_22 -= c_23 * tmp;
+		c_32 -= c_33 * tmp;
+		tmp = E[1+bs*3];
+		c_01 -= c_03 * tmp;
+		c_11 -= c_13 * tmp;
+		c_21 -= c_23 * tmp;
+		c_31 -= c_33 * tmp;
+		tmp = E[0+bs*3];
+		c_00 -= c_03 * tmp;
+		c_10 -= c_13 * tmp;
+		c_20 -= c_23 * tmp;
+		c_30 -= c_33 * tmp;
+		}
+
+	if(kn>2)
+		{
+		tmp = inv_diag_E[2];
+		c_02 *= tmp;
+		c_12 *= tmp;
+		c_22 *= tmp;
+		c_32 *= tmp;
+		tmp = E[1+bs*2];
+		c_01 -= c_02 * tmp;
+		c_11 -= c_12 * tmp;
+		c_21 -= c_22 * tmp;
+		c_31 -= c_32 * tmp;
+		tmp = E[0+bs*2];
+		c_00 -= c_02 * tmp;
+		c_10 -= c_12 * tmp;
+		c_20 -= c_22 * tmp;
+		c_30 -= c_32 * tmp;
+		}
+
+	if(kn>1)
+		{
+		tmp = inv_diag_E[1];
+		c_01 *= tmp;
+		c_11 *= tmp;
+		c_21 *= tmp;
+		c_31 *= tmp;
+		tmp = E[0+bs*1];
+		c_00 -= c_01 * tmp;
+		c_10 -= c_11 * tmp;
+		c_20 -= c_21 * tmp;
+		c_30 -= c_31 * tmp;
+		}
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+
+	// NOTE(review): this label is never the target of a goto in this kernel
+	// (kn is handled by the if-blocks above); it appears kept for symmetry
+	// with the rl trsm kernels, but it triggers an unused-label warning.
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size 4x4 variant: delegates to the variable-size kernel with
+// km=kn=4, i.e. the full block is computed and stored (no store masking).
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 LU kernel: D = lu( C - A*B ).
+// First accumulates the negated panel product -A*B over kmax inner
+// iterations, adds the 4x4 block C, then factorizes the result in place
+// (LU without pivoting), writing the factors to D and the reciprocals of
+// the U diagonal to inv_diag_D[0..3].
+//   kmax       - inner (k) dimension of A*B; kmax<=0 skips the product
+//   A          - 4 x kmax panel, 4-row panel-major layout (advanced by 16
+//                doubles per 4 k-iterations)
+//   B          - kmax x 4 operand, stored in 4-row panels; sdb is the
+//                panel stride (B advances by 4*sdb per 4 k-iterations)
+//   C, D       - 4x4 input / output blocks in panel-major layout
+//   inv_diag_D - receives 1.0/diag(U), stored so later solves multiply
+//                instead of divide
+//   km, kn     - number of valid rows / columns (<=4); stores are masked
+//                so nothing is written outside the real matrix
+// NOTE(review): no pivoting and no zero-check on the diagonal — the
+// caller is presumably expected to guarantee a factorizable block.
+void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	// main loop: accumulate c_ij -= A(i,k)*B(k,j), unrolled by 4 in k
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0-3 k-iterations
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the 4x4 block C to the (negated) product accumulated above
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// factorization
+	// in-place LU (no pivoting), column by column; each column k scales
+	// the sub-diagonal entries by 1/c_kk and updates the columns to the
+	// right; only the first kn columns are factorized
+
+	// first column
+	tmp = 1.0 / c_00;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+
+	// second column
+	c_11 -= c_10 * c_01;
+	c_21 -= c_20 * c_01;
+	c_31 -= c_30 * c_01;
+
+	tmp = 1.0 / c_11;
+	c_21 *= tmp;
+	c_31 *= tmp;
+	
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+
+	// third column
+	c_12 -= c_10 * c_02;
+	c_22 -= c_20 * c_02;
+	c_32 -= c_30 * c_02;
+
+	c_22 -= c_21 * c_12;
+	c_32 -= c_31 * c_12;
+
+	tmp = 1.0 / c_22;
+	c_32 *= tmp;
+
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+
+	// fourth column
+	c_13 -= c_10 * c_03;
+	c_23 -= c_20 * c_03;
+	c_33 -= c_30 * c_03;
+
+	c_23 -= c_21 * c_13;
+	c_33 -= c_31 * c_13;
+
+	c_33 -= c_32 * c_23;
+
+	tmp = 1.0 / c_33;
+
+	inv_diag_D[3] = tmp;
+
+	// masked store: write only the km valid rows and kn valid columns of D
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size 4x4 variant: delegates to the variable-size LU kernel with
+// km=kn=4, i.e. the full block is factorized and stored (no store masking).
+void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 triangular-solve kernel, lower-left / unit-diagonal:
+// first accumulates C - A*B into a 4x4 register block, then applies a
+// forward substitution using the strictly-lower entries of the 4x4 block E
+// (rows are eliminated top to bottom; the unit diagonal means no division
+// and inv_diag is not needed).
+//   kmax   - inner (k) dimension of A*B; kmax<=0 skips the product
+//   A      - 4 x kmax panel, 4-row panel-major layout
+//   B      - kmax x 4 operand in 4-row panels with panel stride sdb
+//   C, D   - 4x4 input / output blocks in panel-major layout
+//   E      - 4x4 block holding the triangular factor; only E[i+bs*j] with
+//            i>j is read here
+//   km, kn - number of valid rows / columns (<=4); the solve skips rows
+//            beyond km and the store is masked to km x kn
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	// NOTE(review): tmp is declared but never used in this kernel
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_1, e_2, e_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	// main loop: accumulate c_ij -= A(i,k)*B(k,j), unrolled by 4 in k
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0-3 k-iterations
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the 4x4 block C to the (negated) product accumulated above
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solution
+	// forward substitution with unit-diagonal lower-triangular E:
+	// eliminate row 0 from rows 1-3, then row 1 from rows 2-3, then
+	// row 2 from row 3; rows beyond km are skipped (their values are
+	// never stored)
+
+	if(km==1)
+		goto store;
+	
+	e_1 = E[1+bs*0];
+	e_2 = E[2+bs*0];
+	e_3 = E[3+bs*0];
+	c_10 -= e_1 * c_00;
+	c_20 -= e_2 * c_00;
+	c_30 -= e_3 * c_00;
+	c_11 -= e_1 * c_01;
+	c_21 -= e_2 * c_01;
+	c_31 -= e_3 * c_01;
+	c_12 -= e_1 * c_02;
+	c_22 -= e_2 * c_02;
+	c_32 -= e_3 * c_02;
+	c_13 -= e_1 * c_03;
+	c_23 -= e_2 * c_03;
+	c_33 -= e_3 * c_03;
+
+	if(km==2)
+		goto store;
+	
+	e_2 = E[2+bs*1];
+	e_3 = E[3+bs*1];
+	c_20 -= e_2 * c_10;
+	c_30 -= e_3 * c_10;
+	c_21 -= e_2 * c_11;
+	c_31 -= e_3 * c_11;
+	c_22 -= e_2 * c_12;
+	c_32 -= e_3 * c_12;
+	c_23 -= e_2 * c_13;
+	c_33 -= e_3 * c_13;
+
+	if(km==3)
+		goto store;
+	
+	e_3 = E[3+bs*2];
+	c_30 -= e_3 * c_20;
+	c_31 -= e_3 * c_21;
+	c_32 -= e_3 * c_22;
+	c_33 -= e_3 * c_23;
+
+	// masked store: write only the km valid rows and kn valid columns of D
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size 4x4 variant: delegates to the variable-size kernel with
+// km=kn=4, i.e. the full block is solved and stored (no store masking).
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 triangular-solve kernel, right / upper / inverted
+// diagonal: first accumulates C - A*B into a 4x4 register block, then
+// processes the columns left to right, each time eliminating the already
+// solved columns via the upper entries of E and scaling by the
+// precomputed reciprocal diagonal inv_diag_E (multiply instead of divide).
+//   kmax       - inner (k) dimension of A*B; kmax<=0 skips the product
+//   A          - 4 x kmax panel, 4-row panel-major layout
+//   B          - kmax x 4 operand in 4-row panels with panel stride sdb
+//   C, D       - 4x4 input / output blocks in panel-major layout
+//   E          - 4x4 block holding the triangular factor; only E[i+bs*j]
+//                with i<j is read here
+//   inv_diag_E - reciprocals of the diagonal of E
+//   km, kn     - number of valid rows / columns (<=4); columns beyond kn
+//                are not solved and the store is masked to km x kn
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	// NOTE(review): tmp is declared but never used in this kernel
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	// main loop: accumulate c_ij -= A(i,k)*B(k,j), unrolled by 4 in k
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0-3 k-iterations
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the 4x4 block C to the (negated) product accumulated above
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+	
+	// solve
+	// columns are solved left to right: column j is first reduced by the
+	// already-solved columns 0..j-1 (using E's above-diagonal entries),
+	// then scaled by the reciprocal diagonal entry; kn limits how many
+	// columns are solved
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+	
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+	
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+	
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	// masked store: write only the km valid rows and kn valid columns of D
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size 4x4 variant: delegates to the variable-size kernel with
+// km=kn=4, i.e. the full block is solved and stored (no store masking).
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 triangular-solve kernel, left / upper / inverted
+// diagonal: first accumulates C - A*B into a 4x4 register block, then
+// applies a backward substitution on the rows (bottom to top) using the
+// above-diagonal entries of E and the precomputed reciprocal diagonal
+// inv_diag_E (multiply instead of divide).
+//   kmax       - inner (k) dimension of A*B; kmax<=0 skips the product
+//   A          - 4 x kmax panel, 4-row panel-major layout
+//   B          - kmax x 4 operand in 4-row panels with panel stride sdb
+//   C, D       - 4x4 input / output blocks in panel-major layout
+//   E          - 4x4 block holding the triangular factor; only E[i+bs*j]
+//                with i<j is read here
+//   inv_diag_E - reciprocals of the diagonal of E
+//   km, kn     - number of valid rows / columns (<=4); rows beyond km are
+//                skipped in the solve and the store is masked to km x kn
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	// NOTE(review): tmp is declared but never used in this kernel
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	// main loop: accumulate c_ij -= A(i,k)*B(k,j), unrolled by 4 in k
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// clean-up loop for the remaining 0-3 k-iterations
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the 4x4 block C to the (negated) product accumulated above
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+//	printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+//	printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+//	printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+//	printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+	
+	// solve
+	// backward substitution: scale row 3 by 1/e_33 and eliminate it from
+	// rows 0-2, then row 2, then row 1, finally scale row 0; rows at or
+	// beyond km are skipped since they are never stored
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+	
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+	
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+	
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	// masked store: write only the km valid rows and kn valid columns of D
+	// NOTE(review): unlike the sibling kernels, the store label is never
+	// the target of a goto here (the solve uses if-blocks instead);
+	// harmless but may trigger an unused-label warning
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size 4x4 variant: delegates to the variable-size kernel with
+// km=kn=4, i.e. the full block is solved and stored (no store masking).
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
diff --git a/kernel/c99/kernel_dgemm_diag_lib4.c b/kernel/c99/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..cad2b21
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,1111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_0;
+		c_2 = a_2 * b_0;
+		c_3 = a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = a_0 * b_1;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_1;
+		c_3 = a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = a_0 * b_2;
+		c_1 = a_1 * b_2;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = a_0 * b_3;
+		c_1 = a_1 * b_3;
+		c_2 = a_2 * b_3;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1,
+		b_0, b_1,
+		c_0, c_1;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+		
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+#endif
+
+
diff --git a/kernel/c99/kernel_dgemv_4_lib4.c b/kernel/c99/kernel_dgemv_4_lib4.c
new file mode 100644
index 0000000..9f11b5f
--- /dev/null
+++ b/kernel/c99/kernel_dgemv_4_lib4.c
@@ -0,0 +1,1009 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		x_0,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		x_0 = x[1];
+
+		y_0 += A[0+bs*1] * x_0;
+		y_1 += A[1+bs*1] * x_0;
+		y_2 += A[2+bs*1] * x_0;
+		y_3 += A[3+bs*1] * x_0;
+		
+		x_0 = x[2];
+
+		y_0 += A[0+bs*2] * x_0;
+		y_1 += A[1+bs*2] * x_0;
+		y_2 += A[2+bs*2] * x_0;
+		y_3 += A[3+bs*2] * x_0;
+		
+		x_0 = x[3];
+
+		y_0 += A[0+bs*3] * x_0;
+		y_1 += A[1+bs*3] * x_0;
+		y_2 += A[2+bs*3] * x_0;
+		y_3 += A[3+bs*3] * x_0;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(k0<=0 & k1>3)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		if(k0<=0 & k1>0) z[0] = y_0;
+		if(k0<=1 & k1>1) z[1] = y_1;
+		if(k0<=2 & k1>2) z[2] = y_2;
+		if(k0<=3 & k1>3) z[3] = y_3;
+		}
+
+	}
+#endif
+	
+	
+	
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_vs_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1)
+	{
+
+	kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z)
+	{
+
+	kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km)
+	{
+
+	const int bs  = 4;
+	
+	int k, kend;
+	
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	if(offA!=0) // 1, 2, 3
+		{
+		kend = 4-offA<kmax ? 4-offA : kmax;
+		for(; k<kend; k++)
+			{
+			
+			x_0 = x[0];
+		
+			y_0 += A[0+bs*0] * x_0;
+			y_1 += A[0+bs*1] * x_0;
+			y_2 += A[0+bs*2] * x_0;
+			y_3 += A[0+bs*3] * x_0;
+		
+			A += 1;
+			x += 1;
+			
+			}
+		A += bs*(sda-1);
+		}
+	for(; k<kmax-bs+1; k+=bs)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+	
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+	
+		A += 1;
+		x += 1;
+		
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0;
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z)
+	{
+
+	kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_vs_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1)
+	{
+
+	kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size forward-substitution kernel (lower / non-transposed /
// inverted diagonal): first accumulates y_acc = y - A[0:4,0:kmax]*x, the
// contribution of the kmax already-solved columns, then solves the trailing
// 4x4 lower-triangular diagonal block using the precomputed reciprocal
// diagonal inv_diag_A (no divisions).
//
// kmax       : number of columns to the left of the diagonal block
// A          : panel-major (bs=4) row panel; the diagonal block follows the
//              kmax gemv columns
// inv_diag_A : 1.0 / diagonal entries of the diagonal block
// x          : already-computed solution entries (kmax of them)
// y          : right-hand side for these 4 rows (read only)
// z          : output vector, receives the solved/updated entries
// km         : number of rows to store (edge clipping)
// kn         : number of equations actually solved (edge clipping)
void kernel_dtrsv_ln_inv_4_vs_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn)
	{

	const int bs = 4; // panel height

	int k;

	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	// gemv part: y_acc -= A*x over the kmax columns left of the diagonal
	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[1+bs*0] * x_0;
		y_2 -= A[2+bs*0] * x_0;
		y_3 -= A[3+bs*0] * x_0;

		y_0 -= A[0+bs*1] * x_1;
		y_1 -= A[1+bs*1] * x_1;
		y_2 -= A[2+bs*1] * x_1;
		y_3 -= A[3+bs*1] * x_1;

		y_0 -= A[0+bs*2] * x_2;
		y_1 -= A[1+bs*2] * x_2;
		y_2 -= A[2+bs*2] * x_2;
		y_3 -= A[3+bs*2] * x_2;

		y_0 -= A[0+bs*3] * x_3;
		y_1 -= A[1+bs*3] * x_3;
		y_2 -= A[2+bs*3] * x_3;
		y_3 -= A[3+bs*3] * x_3;

		A += 4*bs;
		x += 4;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;
	y_3 = y[3] + y_3;

	double
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// a_00: solve equation 0, substitute into equations 1..3
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	y_0 *= a_00;
	z[0] = y_0;
	y_1 -= a_10 * y_0;
	y_2 -= a_20 * y_0;
	y_3 -= a_30 * y_0;

	if(kn==1)
		{
		if(km==1)
			return;
		// BUGFIX(review): the partially updated entries belong in the output
		// vector z, not the input right-hand side y (original wrote y[1..3],
		// inconsistent with every other store in this function).
		z[1] = y_1;
		if(km==2)
			return;
		z[2] = y_2;
		if(km==3)
			return;
		z[3] = y_3;
		return;
		}

	// a_11: solve equation 1, substitute into equations 2..3
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	y_1 *= a_11;
	z[1] = y_1;
	y_2 -= a_21 * y_1;
	y_3 -= a_31 * y_1;

	if(kn==2)
		{
		if(km==2)
			return;
		z[2] = y_2; // BUGFIX(review): was y[2]
		if(km==3)
			return;
		z[3] = y_3; // BUGFIX(review): was y[3]
		return;
		}

	// a_22 (registers a_00/a_10 reused for the third column)
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	y_2 *= a_00;
	z[2] = y_2;
	y_3 -= a_10 * y_2;

	if(kn==3)
		{
		if(km==3)
			return;
		z[3] = y_3; // BUGFIX(review): was y[3]

		return;
		}

	// a_33: solve the last equation
	a_11 = inv_diag_A[3];
	y_3 *= a_11;
	z[3] = y_3;

	}
+#endif
+	
+
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size forward-substitution kernel: full 4x4 diagonal block, simply
// delegates to the variable-size kernel with km=kn=4 (no edge clipping).
void kernel_dtrsv_ln_inv_4_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z)
	{

	kernel_dtrsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);


	}
+#endif
+	
+	
+		
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel (lower transposed, L^T z = y, inverted
// diagonal), 4 equations: first accumulates the contribution
// -A[4:kmax,0:4]^T * x[4:kmax] of the rows below the diagonal block, then
// solves the transposed 4x4 triangular block bottom-up, multiplying by the
// precomputed reciprocal diagonal inv_diag_A instead of dividing.
//
// kmax : total row count (4 block rows plus kmax-4 rows below)
// A    : panel-major (bs=4) diagonal block; following rows at panel stride sda
// x    : vector holding the already-computed entries below the block
// y    : right-hand side for these 4 equations; z : solved output
void kernel_dtrsv_lt_inv_4_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
	{

	const int bs = 4;
	
	int
		k;
	
	double *tA, *tx; // saved block pointers, restored for the triangular phase
	tA = A;
	tx = x;

	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;
	
	// gemv part: accumulate -A^T*x over the rows below the diagonal block
	k=4;
	A += 4 + (sda-1)*bs; // advance to the next row panel
	x += 4;
	for(; k<kmax-3; k+=4)
		{
		
		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		y_3 -= A[0+bs*3] * x_0;

		y_0 -= A[1+bs*0] * x_1;
		y_1 -= A[1+bs*1] * x_1;
		y_2 -= A[1+bs*2] * x_1;
		y_3 -= A[1+bs*3] * x_1;
		
		y_0 -= A[2+bs*0] * x_2;
		y_1 -= A[2+bs*1] * x_2;
		y_2 -= A[2+bs*2] * x_2;
		y_3 -= A[2+bs*3] * x_2;

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;
		y_3 -= A[3+bs*3] * x_3;
		
		A += sda*bs;
		x += 4;

		}
	// leftover rows (fewer than 4)
	for(; k<kmax; k++)
		{
		
		x_0 = x[0];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		y_3 -= A[0+bs*3] * x_0;
		
		A += 1;//sda*bs;
		x += 1;

		}
	
	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;
	y_3 = y[3] + y_3;

	A = tA;
	x = tx;

	// bottom triangle: solve equations 3 and 2
	y_3 *= inv_diag_A[3];
	z[3] = y_3;

	y_2 -= A[3+bs*2] * y_3;
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square block: substitute y_2, y_3 into equations 0 and 1
	y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
	y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
		
	// top triangle: solve equations 1 and 0
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel (lower transposed, inverted diagonal),
// 3 equations: accumulates -A[3:kmax,0:3]^T * x[3:kmax] over the rows below
// the 3x3 diagonal sub-block, then solves the transposed triangular block
// bottom-up using the reciprocal diagonal inv_diag_A.
//
// kmax : total row count; rows 0-2 form the triangle, rows 3..kmax-1 the tail
// A    : panel-major (bs=4) block; further row panels at stride sda
// x    : already-computed entries below the block; y : rhs; z : output
void kernel_dtrsv_lt_inv_3_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
	{

	const int bs = 4;

	int
		k;

	double *tA, *tx; // saved block pointers, restored for the triangular phase
	tA = A;
	tx = x;

	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 completes the first panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // advance to the next row panel
		x += 4;
		for(; k<kmax-3; k+=4)
			{
			
			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];
			
			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;
			
			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;
			
			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// kmax<=4: position pointers on row 3 of the first panel
		A += 3;
		// BUGFIX(review): x must advance in lockstep with A (3 rows), so the
		// cleanup loop below reads x[3]; the original advanced by 1 and read
		// x[1], inconsistent with the kmax>4 path which uses x_3 = x[3].
		x += 3;
		}
	// leftover rows (fewer than 4)
	for(; k<kmax; k++)
		{
		
		x_0 = x[0];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		
		A += 1;//sda*bs;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;
	x = tx;

	// bottom triangle: solve equation 2
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square block: substitute y_2 into equations 0 and 1
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;
		
	// top triangle: solve equations 1 and 0
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel (lower transposed, inverted diagonal),
// 2 equations: accumulates -A[2:kmax,0:2]^T * x[2:kmax] over the rows below
// the 2x2 diagonal sub-block, then solves the transposed triangle bottom-up
// using the reciprocal diagonal inv_diag_A.
void kernel_dtrsv_lt_inv_2_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
	{

	const int bs = 4;
	
	int
		k;
	
	double *tA, *tx; // saved block pointers, restored for the triangular phase
	tA = A;
	tx = x;

	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0;
	
	k = 2;
	if(kmax>4)
		{
		// clean up at the beginning: rows 2 and 3 complete the first panel
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[2+bs*0] * x_2;
		y_1 -= A[2+bs*1] * x_2;

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;

		k=4;
		A += 4 + (sda-1)*bs; // advance to the next row panel
		x += 4;
		for(; k<kmax-3; k+=4)
			{
			
			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];
			
			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			
			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			
			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// kmax<=4: position pointers on row 2 of the first panel
		A += 2;
		x += 2;
		}
	// leftover rows (fewer than 4)
	for(; k<kmax; k++)
		{
		
		x_0 = x[0];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		
		A += 1;//sda*bs;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;

	A = tA;
	x = tx;

	// top triangle: solve equations 1 and 0
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel (lower transposed, inverted diagonal),
// single equation:
//   z[0] = inv_diag_A[0] * ( y[0] - A[1:kmax,0]^T * x[1:kmax] ).
// A is panel-major: bs=4 rows per panel, consecutive row panels sda*bs
// doubles apart. The subtraction order matches the unrolled original
// exactly, so results are bit-identical.
void kernel_dtrsv_lt_inv_1_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
	{

	const int bs = 4;

	int k;

	double acc = 0.0; // running value of -A[1:kmax,0]^T * x[1:kmax]

	double *pa = A + 1; // first sub-diagonal entry of column 0
	double *px = x + 1;

	k = 1;
	if(kmax>4)
		{
		// finish the first panel one row at a time (rows 1..3)
		for(; k<4; k++)
			{
			acc -= pa[0] * px[0];
			pa += 1;
			px += 1;
			}
		pa += (sda-1)*bs; // jump to the next row panel
		// full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			acc -= pa[0] * px[0];
			acc -= pa[1] * px[1];
			acc -= pa[2] * px[2];
			acc -= pa[3] * px[3];
			pa += sda*bs;
			px += 4;
			}
		}
	// leftover rows (fewer than 4); also handles 1 < kmax <= 4 entirely
	for(; k<kmax; k++)
		{
		acc -= pa[0] * px[0];
		pa += 1;
		px += 1;
		}

	// divide by the diagonal via its precomputed reciprocal
	z[0] = (y[0] + acc) * inv_diag_A[0];

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Triangular matrix-vector product, upper / non-transposed:
// z[0:4] = U[0:4,0:kmax] * x[0:kmax], where only the upper-triangular part of
// the first 4 columns contributes (the commented-out lines mark the entries
// below the diagonal that are skipped).
// NOTE(review): reads x[0..3] and the full first 4 columns unconditionally —
// assumes kmax >= 4; confirm with callers.
void kernel_dtrmv_un_4_lib4(int kmax, double *A, double *x, double *z)
	{

	const int bs = 4;
	
	int k;

	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;
	
	// first 4 columns: triangular part only
	x_0 = x[0];
	x_1 = x[1];
	x_2 = x[2];
	x_3 = x[3];

	y_0 += A[0+bs*0] * x_0;
/*	y_1 += A[1+bs*0] * x_0;*/
/*	y_2 += A[2+bs*0] * x_0;*/
/*	y_3 += A[3+bs*0] * x_0;*/

	y_0 += A[0+bs*1] * x_1;
	y_1 += A[1+bs*1] * x_1;
/*	y_2 += A[2+bs*1] * x_1;*/
/*	y_3 += A[3+bs*1] * x_1;*/

	y_0 += A[0+bs*2] * x_2;
	y_1 += A[1+bs*2] * x_2;
	y_2 += A[2+bs*2] * x_2;
/*	y_3 += A[3+bs*2] * x_2;*/

	y_0 += A[0+bs*3] * x_3;
	y_1 += A[1+bs*3] * x_3;
	y_2 += A[2+bs*3] * x_3;
	y_3 += A[3+bs*3] * x_3;
	
	A += 4*bs;
	x += 4;

	// remaining columns: dense 4-row gemv, unrolled by 4
	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		y_0 += A[0+bs*1] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[2+bs*1] * x_1;
		y_3 += A[3+bs*1] * x_1;

		y_0 += A[0+bs*2] * x_2;
		y_1 += A[1+bs*2] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[3+bs*2] * x_2;

		y_0 += A[0+bs*3] * x_3;
		y_1 += A[1+bs*3] * x_3;
		y_2 += A[2+bs*3] * x_3;
		y_3 += A[3+bs*3] * x_3;
		
		A += 4*bs;
		x += 4;

		}

	// leftover columns (fewer than 4)
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;
		
		A += 1*bs;
		x += 1;

		}

	z[0] = y_0;
	z[1] = y_1;
	z[2] = y_2;
	z[3] = y_3;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size triangular matrix-vector product, upper transposed:
// z[0:km] = (U[0:kmax,0:4])^T * x[0:kmax], A panel-major (bs=4) with panel
// stride sda. The final 4 rows use only the triangular entries (the
// commented-out lines mark the skipped sub-triangle); km clips the store at
// the matrix edge.
// NOTE(review): the tail reads x[0..3] of the last panel unconditionally —
// assumes kmax >= 4; confirm with callers.
void kernel_dtrmv_ut_4_vs_lib4(int kmax, double *A, int sda, double *x, double *z, int km)
	{

	const int bs  = 4;
	
	int
		k;
	
	double
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;
	
	// dense part: full 4-row panels above the final triangle
	k=0;
	for(; k<kmax-4; k+=4)
		{
		
		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];
		
		y_0 += A[0+bs*0] * x_0;
		y_1 += A[0+bs*1] * x_0;
		y_2 += A[0+bs*2] * x_0;
		y_3 += A[0+bs*3] * x_0;

		y_0 += A[1+bs*0] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[1+bs*2] * x_1;
		y_3 += A[1+bs*3] * x_1;
		
		y_0 += A[2+bs*0] * x_2;
		y_1 += A[2+bs*1] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[2+bs*3] * x_2;

		y_0 += A[3+bs*0] * x_3;
		y_1 += A[3+bs*1] * x_3;
		y_2 += A[3+bs*2] * x_3;
		y_3 += A[3+bs*3] * x_3;
		
		A += sda*bs;
		x += 4;

		}

	// final panel: triangular rows only
	x_0 = x[0];
	x_1 = x[1];
	x_2 = x[2];
	x_3 = x[3];
	
	y_0 += A[0+bs*0] * x_0;
	y_1 += A[0+bs*1] * x_0;
	y_2 += A[0+bs*2] * x_0;
	y_3 += A[0+bs*3] * x_0;

/*	y_0 += A[1+bs*0] * x_1;*/
	y_1 += A[1+bs*1] * x_1;
	y_2 += A[1+bs*2] * x_1;
	y_3 += A[1+bs*3] * x_1;
	
/*	y_0 += A[2+bs*0] * x_2;*/
/*	y_1 += A[2+bs*1] * x_2;*/
	y_2 += A[2+bs*2] * x_2;
	y_3 += A[2+bs*3] * x_2;

/*	y_0 += A[3+bs*0] * x_3;*/
/*	y_1 += A[3+bs*1] * x_3;*/
/*	y_2 += A[3+bs*2] * x_3;*/
	y_3 += A[3+bs*3] * x_3;
	
//	A += sda*bs;
//	x += 4;

	// store_vs: store only the first km entries
	// NOTE(review): the label below is unreferenced (no goto in this
	// function) — candidate for removal.
	store:
	if(km>=4)
		{
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		z[0] = y_0;
		if(km>=2)
			{
			z[1] = y_1;
			if(km>2)
				{
				z[2] = y_2;
				}
			}
		}

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size upper-transposed trmv kernel: full 4-entry store, delegates to
// the variable-size kernel with km=4.
void kernel_dtrmv_ut_4_lib4(int kmax, double *A, int sda, double *x, double *z)
	{
	
	kernel_dtrmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);

	}
+#endif
+
+
+
+
+
diff --git a/kernel/c99/kernel_dgeqrf_4_lib4.c b/kernel/c99/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..071ec86
--- /dev/null
+++ b/kernel/c99/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2620 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
// Unblocked Householder QR factorization of the leading 4 columns of an
// m-row panel-major matrix (panel size ps=4, panel stride sdd).
// On exit: the upper triangle of pD holds R, the strictly-lower part holds
// the Householder vectors v (unit diagonal implicit), and dD[0..3] the tau
// scalars. The formulas below (beta = -sign(alpha)*norm, tau =
// (beta-alpha)/beta, v scaled by 1/(alpha-beta)) follow the LAPACK
// dlarfg/dgeqr2 convention.
void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
	{
	int ii, jj, ll;
	double alpha, beta, tmp, w1, w2, w3;
	const int ps = 4;
	// first column
	// beta accumulates the squared norm of the sub-diagonal part of column 0
	beta = 0.0;
	ii = 1;
	if(m>1)
		{
		tmp = pD[1+ps*0];
		beta += tmp*tmp;
		if(m>2)
			{
			tmp = pD[2+ps*0];
			beta += tmp*tmp;
			if(m>3)
				{
				tmp = pD[3+ps*0];
				beta += tmp*tmp;
				}
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		tmp = pD[0+ii*sdd+ps*0];
		beta += tmp*tmp;
		tmp = pD[1+ii*sdd+ps*0];
		beta += tmp*tmp;
		tmp = pD[2+ii*sdd+ps*0];
		beta += tmp*tmp;
		tmp = pD[3+ii*sdd+ps*0];
		beta += tmp*tmp;
		}
	for(ll=0; ll<m-ii; ll++)
		{
		tmp = pD[ll+ii*sdd+ps*0];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau: column already zero below the diagonal -> no reflector needed
		dD[0] = 0.0;
		}
	else
		{
		alpha = pD[0+ps*0];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta; // sign opposite to alpha avoids cancellation
		// tau0
		dD[0] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0: scale sub-diagonal entries; diagonal receives beta = R(0,0)
		pD[0+ps*0] = beta;
		ii = 1;
		if(m>1)
			{
			pD[1+ps*0] *= tmp;
			if(m>2)
				{
				pD[2+ps*0] *= tmp;
				if(m>3)
					{
					pD[3+ps*0] *= tmp;
					}
				}
			}
		for(ii=4; ii<m-3; ii+=4)
			{
			pD[0+ii*sdd+ps*0] *= tmp;
			pD[1+ii*sdd+ps*0] *= tmp;
			pD[2+ii*sdd+ps*0] *= tmp;
			pD[3+ii*sdd+ps*0] *= tmp;
			}
		for(ll=0; ll<m-ii; ll++)
			{
			pD[ll+ii*sdd+ps*0] *= tmp;
			}
		}
	// gemv_t & ger: apply H0 = I - tau0*v0*v0^T to columns 1..3
	// (w = v0^T * C, then C += v0 * (-tau0*w))
	w1 = pD[0+ps*1];
	w2 = pD[0+ps*2];
	w3 = pD[0+ps*3];
	if(m>1)
		{
		w1 += pD[1+ps*1] * pD[1+ps*0];
		w2 += pD[1+ps*2] * pD[1+ps*0];
		w3 += pD[1+ps*3] * pD[1+ps*0];
		if(m>2)
			{
			w1 += pD[2+ps*1] * pD[2+ps*0];
			w2 += pD[2+ps*2] * pD[2+ps*0];
			w3 += pD[2+ps*3] * pD[2+ps*0];
			if(m>3)
				{
				w1 += pD[3+ps*1] * pD[3+ps*0];
				w2 += pD[3+ps*2] * pD[3+ps*0];
				w3 += pD[3+ps*3] * pD[3+ps*0];
				}
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
		}
	w1 = - dD[0] * w1;
	w2 = - dD[0] * w2;
	w3 = - dD[0] * w3;
	pD[0+ps*1] += w1;
	pD[0+ps*2] += w2;
	pD[0+ps*3] += w3;
	if(m>1)
		{
		pD[1+ps*1] += w1 * pD[1+ps*0];
		pD[1+ps*2] += w2 * pD[1+ps*0];
		pD[1+ps*3] += w3 * pD[1+ps*0];
		if(m>2)
			{
			pD[2+ps*1] += w1 * pD[2+ps*0];
			pD[2+ps*2] += w2 * pD[2+ps*0];
			pD[2+ps*3] += w3 * pD[2+ps*0];
			if(m>3)
				{
				pD[3+ps*1] += w1 * pD[3+ps*0];
				pD[3+ps*2] += w2 * pD[3+ps*0];
				pD[3+ps*3] += w3 * pD[3+ps*0];
				}
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
		}
	if(m==1)
		return;
	// second column: same pattern on the trailing (m-1)x3 sub-matrix
	beta = 0.0;
	if(m>2)
		{
		tmp = pD[2+ps*1];
		beta += tmp*tmp;
		if(m>3)
			{
			tmp = pD[3+ps*1];
			beta += tmp*tmp;
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		tmp = pD[0+ii*sdd+ps*1];
		beta += tmp*tmp;
		tmp = pD[1+ii*sdd+ps*1];
		beta += tmp*tmp;
		tmp = pD[2+ii*sdd+ps*1];
		beta += tmp*tmp;
		tmp = pD[3+ii*sdd+ps*1];
		beta += tmp*tmp;
		}
	for(ll=0; ll<m-ii; ll++)
		{
		tmp = pD[ll+ii*sdd+ps*1];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[1] = 0.0;
		}
	else
		{
		alpha = pD[1+ps*1];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[1] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[1+ps*1] = beta;
		if(m>2)
			{
			pD[2+ps*1] *= tmp;
			if(m>3)
				{
				pD[3+ps*1] *= tmp;
				}
			}
		for(ii=4; ii<m-3; ii+=4)
			{
			pD[0+ii*sdd+ps*1] *= tmp;
			pD[1+ii*sdd+ps*1] *= tmp;
			pD[2+ii*sdd+ps*1] *= tmp;
			pD[3+ii*sdd+ps*1] *= tmp;
			}
		for(ll=0; ll<m-ii; ll++)
			{
			pD[ll+ii*sdd+ps*1] *= tmp;
			}
		}
	// gemv_t & ger: apply H1 to columns 2..3
	w2 = pD[1+ps*2];
	w3 = pD[1+ps*3];
	if(m>2)
		{
		w2 += pD[2+ps*2] * pD[2+ps*1];
		w3 += pD[2+ps*3] * pD[2+ps*1];
		if(m>3)
			{
			w2 += pD[3+ps*2] * pD[3+ps*1];
			w3 += pD[3+ps*3] * pD[3+ps*1];
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
		}
	w2 = - dD[1] * w2;
	w3 = - dD[1] * w3;
	pD[1+ps*2] += w2;
	pD[1+ps*3] += w3;
	if(m>2)
		{
		pD[2+ps*2] += w2 * pD[2+ps*1];
		pD[2+ps*3] += w3 * pD[2+ps*1];
		if(m>3)
			{
			pD[3+ps*2] += w2 * pD[3+ps*1];
			pD[3+ps*3] += w3 * pD[3+ps*1];
			}
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
		}
	if(m==2)
		return;
	// third column: same pattern on the trailing (m-2)x2 sub-matrix
	beta = 0.0;
	if(m>3)
		{
		tmp = pD[3+ps*2];
		beta += tmp*tmp;
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		tmp = pD[0+ii*sdd+ps*2];
		beta += tmp*tmp;
		tmp = pD[1+ii*sdd+ps*2];
		beta += tmp*tmp;
		tmp = pD[2+ii*sdd+ps*2];
		beta += tmp*tmp;
		tmp = pD[3+ii*sdd+ps*2];
		beta += tmp*tmp;
		}
	for(ll=0; ll<m-ii; ll++)
		{
		tmp = pD[ll+ii*sdd+ps*2];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[2] = 0.0;
		}
	else
		{
		alpha = pD[2+ps*2];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[2] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[2+ps*2] = beta;
		if(m>3)
			{
			pD[3+ps*2] *= tmp;
			}
		for(ii=4; ii<m-3; ii+=4)
			{
			pD[0+ii*sdd+ps*2] *= tmp;
			pD[1+ii*sdd+ps*2] *= tmp;
			pD[2+ii*sdd+ps*2] *= tmp;
			pD[3+ii*sdd+ps*2] *= tmp;
			}
		for(ll=0; ll<m-ii; ll++)
			{
			pD[ll+ii*sdd+ps*2] *= tmp;
			}
		}
	// gemv_t & ger: apply H2 to column 3
	w3 = pD[2+ps*3];
	if(m>3)
		{
		w3 += pD[3+ps*3] * pD[3+ps*2];
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
		}
	w3 = - dD[2] * w3;
	pD[2+ps*3] += w3;
	if(m>3)
		{
		pD[3+ps*3] += w3 * pD[3+ps*2];
		}
	for(ii=4; ii<m-3; ii+=4)
		{
		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
		}
	for(ll=0; ll<m-ii; ll++)
		{
		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
		}
	if(m==3)
		return;
	// fourth column: reflector only, no trailing columns to update
	beta = 0.0;
	for(ii=4; ii<m-3; ii+=4)
		{
		tmp = pD[0+ii*sdd+ps*3];
		beta += tmp*tmp;
		tmp = pD[1+ii*sdd+ps*3];
		beta += tmp*tmp;
		tmp = pD[2+ii*sdd+ps*3];
		beta += tmp*tmp;
		tmp = pD[3+ii*sdd+ps*3];
		beta += tmp*tmp;
		}
	for(ll=0; ll<m-ii; ll++)
		{
		tmp = pD[ll+ii*sdd+ps*3];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tau
		dD[3] = 0.0;
		}
	else
		{
		alpha = pD[3+ps*3];
		beta += alpha*alpha;
		beta = sqrt(beta);
		if(alpha>0)
			beta = -beta;
		// tau0
		dD[3] = (beta-alpha) / beta;
		tmp = 1.0 / (alpha-beta);
		// compute v0
		pD[3+ps*3] = beta;
		for(ii=4; ii<m-3; ii+=4)
			{
			pD[0+ii*sdd+ps*3] *= tmp;
			pD[1+ii*sdd+ps*3] *= tmp;
			pD[2+ii*sdd+ps*3] *= tmp;
			pD[3+ii*sdd+ps*3] *= tmp;
			}
		for(ll=0; ll<m-ii; ll++)
			{
			pD[ll+ii*sdd+ps*3] *= tmp;
			}
		}
	return;
	}
+
+
+// unblocked algorithm
// Variable-size unblocked Householder QR (dgeqr2-style): computes k
// reflectors of an m x n sub-matrix starting at row offset offD within a
// panel-major (ps=4) layout with panel stride sdd; tau scalars go to dD.
// The (x & (ps-1)) arithmetic is x mod 4, used to locate positions inside a
// panel and to split loops at panel boundaries.
void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
	{
	// bitwise | on 0/1 relational results — intentional branch-free style
	if(m<=0 | n<=0)
		return;
	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
	const int ps = 4;
	imax = k; //m<n ? m : n;
	double alpha, beta, tmp, w0;
	double *pC00, *pC10, *pC01, *pC11;
	int offset;
	double *pD0 = pD-offD; // rewind to the panel-aligned base pointer
	for(ii=0; ii<imax; ii++)
		{
		// pC00 -> diagonal element (ii,ii); pC10 -> first sub-diagonal element
		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
		// squared norm of the sub-diagonal part of column ii;
		// jmax0 = entries left in the current panel before the boundary
		beta = 0.0;
		jmax = m-ii-1;
		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
		jmax0 = jmax<jmax0 ? jmax : jmax0;
		offset = 0;
		jj = 0;
		if(jmax0>0)
			{
			for( ; jj<jmax0; jj++)
				{
				tmp = pC10[0+offset];
				beta += tmp*tmp;
				offset += 1;
				}
			offset += -ps+ps*sdd; // hop across the panel boundary
			}
		for( ; jj<jmax-3; jj+=4)
			{
			tmp = pC10[0+offset];
			beta += tmp*tmp;
			tmp = pC10[1+offset];
			beta += tmp*tmp;
			tmp = pC10[2+offset];
			beta += tmp*tmp;
			tmp = pC10[3+offset];
			beta += tmp*tmp;
			offset += ps*sdd;
			}
		for(ll=0; ll<jmax-jj; ll++)
			{
			tmp = pC10[0+offset];
			beta += tmp*tmp;
			offset += 1;
			}
		if(beta==0.0)
			{
			// column already zero below the diagonal -> no reflector
			dD[ii] = 0.0;
			}
		else
			{
			// dlarfg-style reflector: beta = -sign(alpha)*norm,
			// tau = (beta-alpha)/beta, v scaled by 1/(alpha-beta)
			alpha = pC00[0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta;
			dD[ii] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			offset = 0;
			jj = 0;
			if(jmax0>0)
				{
				for( ; jj<jmax0; jj++)
					{
					pC10[0+offset] *= tmp;
					offset += 1;
					}
				offset += -ps+ps*sdd;
				}
			for( ; jj<jmax-3; jj+=4)
				{
				pC10[0+offset] *= tmp;
				pC10[1+offset] *= tmp;
				pC10[2+offset] *= tmp;
				pC10[3+offset] *= tmp;
				offset += ps*sdd;
				}
			for(ll=0; ll<jmax-jj; ll++)
				{
				pC10[0+offset] *= tmp;
				offset += 1;
				}
			pC00[0] = beta; // diagonal receives R(ii,ii)
			}
		if(ii<n)
			{
			// apply H_ii = I - tau*v*v^T to the trailing columns:
			// for each column jj: w0 = v^T*c, then c += -tau*w0*v
			pC01 = pC00 + ps;
			pC11 = pC10 + ps;
			kmax = jmax;
			kmax0 = jmax0;
			jmax = n-ii-1;
			jj = 0;
			for( ; jj<jmax; jj++)
				{
				w0 = pC01[0+ps*jj] * 1.0; // implicit v[0] = 1
				offset = 0;
				kk = 0;
				if(kmax0>0)
					{
					for( ; kk<kmax0; kk++)
						{
						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
						offset += 1;
						}
					offset += -ps+ps*sdd;
					}
				for( ; kk<kmax-3; kk+=4)
					{
					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
					offset += ps*sdd;
					}
				for(ll=0; ll<kmax-kk; ll++)
					{
					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
					offset += 1;
					}
				w0 = - dD[ii] * w0;
				pC01[0+ps*jj] += w0;
				offset = 0;
				kk = 0;
				if(kmax0>0)
					{
					for( ; kk<kmax0; kk++)
						{
						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
						offset += 1;
						}
					offset = offset-ps+ps*sdd;
					}
				for( ; kk<kmax-3; kk+=4)
					{
					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
					offset += ps*sdd;
					}
				for(ll=0; ll<kmax-kk; ll++)
					{
					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
					offset += 1;
					}
				}
			}
		}
	return;
	}
+
+
+
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};
+	int ldt = 4;
+	double pW[8];// = {};
+	int ldw = 2;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
+// Apply a block of 4 Householder reflectors (transposed form) to the m x n
+// panel-major matrix C:  C -= V * (T * (C^T * V))^T, where the 4 reflector
+// vectors are read from the transposed copy pVt (row i of V = column i of pVt),
+// the taus are in dD, and the 4x4 lower-triangular factor T is built locally.
+// pD holds the factored panel (sdd = panel stride), pC0/sdc the matrix to
+// update. NOTE(review): pD is assumed to hold the same reflector data as pVt,
+// in untransposed layout — confirm against callers.
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
+	{
+	// bitwise | on 0/1 comparison results: branch-free, equivalent to || here
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double c00, c01,
+	       c10, c11,
+	       c20, c21,
+	       c30, c31;
+	double a0, a1, a2, a3, b0, b1;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};
+	int ldt = 4;
+	double pW[8];// = {};
+	int ldw = 4;
+	// dot product of v
+	// vIJ accumulates v_I^T * v_J for the reflector vectors stored in the
+	// columns of pD; the unit diagonal element of each vector is implicit
+	// (accounted for by the 1.0* terms below), entries above it are zero.
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	// remaining panels, 4 rows at a time
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	// cleanup: leftover rows (m not a multiple of 4)
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	// T(i,i) = tau_i; strictly-lower entries follow the dlarft recurrence
+	// T(i,j) = -tau_i * v_i^T v_j * T(j,j) accumulated column by column.
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	// pW is a 4x2 workspace (two columns of C processed per iteration)
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	// main loop: update two columns of C at a time
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[0+ldw*1] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] = tmp;
+			tmp = pC[1+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += d0 * tmp;
+				pW[1+ldw*0] += d1 * tmp;
+				pW[2+ldw*0] = tmp;
+				tmp = pC[2+ps*1];
+				pW[0+ldw*1] += d0 * tmp;
+				pW[1+ldw*1] += d1 * tmp;
+				pW[2+ldw*1] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += d0 * tmp;
+					pW[1+ldw*0] += d1 * tmp;
+					pW[2+ldw*0] += d2 * tmp;
+					pW[3+ldw*0] = tmp;
+					tmp = pC[3+ps*1];
+					pW[0+ldw*1] += d0 * tmp;
+					pW[1+ldw*1] += d1 * tmp;
+					pW[2+ldw*1] += d2 * tmp;
+					pW[3+ldw*1] = tmp;
+					}
+				}
+			}
+		// accumulate remaining panels of C^T * V, 4 rows at a time
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			}
+		// cleanup rows of C^T * V
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			}
+		// compute W^T *= T
+		// in-place triangular multiply: row 3 first so unmodified rows are read
+		pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+		pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
+		pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+		pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
+		pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		c01 = pC[0+jj*sdc+ps*1];
+		c11 = pC[1+jj*sdc+ps*1];
+		c21 = pC[2+jj*sdc+ps*1];
+		c31 = pC[3+jj*sdc+ps*1];
+		// rank1
+		// the unit diagonal of each reflector is implicit: c00 -= 1.0*b0
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ldw*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ldw*1];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ldw*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ldw*1];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ldw*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ldw*1];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ldw*0];
+		c30 -= b0;
+		b1 = pW[3+ldw*1];
+		c31 -= b1;
+		// store
+		// only rows that actually exist (m may be < 4) are written back
+		pC[0+jj*sdc+ps*0] = c00;
+		pC[0+jj*sdc+ps*1] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[1+jj*sdc+ps*1] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				pC[2+jj*sdc+ps*1] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					pC[3+jj*sdc+ps*1] = c31;
+					}
+				}
+			}
+		// remaining panels of the rank-4 update
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			c01 = pC[0+jj*sdc+ps*1];
+			c11 = pC[1+jj*sdc+ps*1];
+			c21 = pC[2+jj*sdc+ps*1];
+			c31 = pC[3+jj*sdc+ps*1];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[0+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[1+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[2+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[3+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			pC[0+jj*sdc+ps*1] = c01;
+			pC[1+jj*sdc+ps*1] = c11;
+			pC[2+jj*sdc+ps*1] = c21;
+			pC[3+jj*sdc+ps*1] = c31;
+			}
+		// cleanup rows of the rank-4 update
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			c01 = pC[ll+jj*sdc+ps*1];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[0+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[1+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[2+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[3+ldw*1];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			pC[ll+jj*sdc+ps*1] = c01;
+			}
+		}
+	// remainder: single leftover column of C (n odd)
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += d0 * tmp;
+				pW[1+ldw*0] += d1 * tmp;
+				pW[2+ldw*0] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += d0 * tmp;
+					pW[1+ldw*0] += d1 * tmp;
+					pW[2+ldw*0] += d2 * tmp;
+					pW[3+ldw*0] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			}
+		// compute W^T *= T
+		pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+		pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+		pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ldw*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ldw*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ldw*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ldw*0];
+		c30 -= b0;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			}
+		}
+
+	return;
+	}
+
+
+
+// assume n>=4
+// Unblocked LQ factorization of a 4 x n row panel stored panel-major
+// (element (i,j) at pD[i+4*j]). On exit pD holds the L factor on/below the
+// diagonal and the scaled Householder vectors right of it; dD receives the
+// four taus. Identical arithmetic (same accumulation order) as the fully
+// unrolled original, written with rolled loops over the reflector index.
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+	{
+	const int ps = 4;
+	int ii, rr;
+	double alpha, beta, tmp;
+	double w[4]; // gemv_t accumulators for the trailing rows
+	for(rr=0; rr<4; rr++)
+		{
+		// squared norm of row rr strictly right of the diagonal
+		beta = 0.0;
+		for(ii=rr+1; ii<n; ii++)
+			{
+			tmp = pD[rr+ps*ii];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tail already zero: null reflector, tau = 0
+			dD[rr] = 0.0;
+			}
+		else
+			{
+			alpha = pD[rr+ps*rr];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta; // keep sign opposite to alpha for stability
+			// tau
+			dD[rr] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// store beta on the diagonal, scale v (unit head implicit)
+			pD[rr+ps*rr] = beta;
+			for(ii=rr+1; ii<n; ii++)
+				pD[rr+ps*ii] *= tmp;
+			}
+		if(rr==3)
+			break; // last row: no trailing rows left to update
+		// gemv_t: w[k] = (row k) . v, v0 = 1 implicit
+		{
+		int kk;
+		for(kk=rr+1; kk<4; kk++)
+			w[kk] = pD[kk+ps*rr];
+		for(ii=rr+1; ii<n; ii++)
+			{
+			for(kk=rr+1; kk<4; kk++)
+				w[kk] += pD[kk+ps*ii] * pD[rr+ps*ii];
+			}
+		for(kk=rr+1; kk<4; kk++)
+			w[kk] = - dD[rr] * w[kk];
+		// ger: row k += w[k] * v^T (rank-1 downdate of the trailing rows)
+		for(kk=rr+1; kk<4; kk++)
+			pD[kk+ps*rr] += w[kk];
+		for(ii=rr+1; ii<n; ii++)
+			{
+			for(kk=rr+1; kk<4; kk++)
+				pD[kk+ps*ii] += w[kk] * pD[rr+ps*ii];
+			}
+		}
+		}
+	return;
+	}
+
+
+
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k;//m<n ? m : n;
+	double alpha, beta, tmp;
+	double w00, w01,
+		   w10, w11,
+		   w20, w21,
+		   w30, w31;
+	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+	double pT[4];
+	int ldt = 2;
+	double *pD0 = pD-offD;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first row
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		kmax = n-ii;
+		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+			}
+		w00 = - w00*dD[ii];
+		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+			}
+		// second row
+		pC11 = pC10+ps*1;
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ps*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC11[0+ps*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ps*jj] *= tmp;
+			}
+		// compute T
+		kmax = n-ii;
+		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+		for(kk=2; kk<kmax; kk++)
+			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+		pT[0+ldt*0] = dD[ii+0];
+		pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldt*1] = dD[ii+1];
+		// downgrade
+		kmax = n-ii;
+		jmax = m-ii-2;
+		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		jj = 0;
+		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+		pC20 = pC20a;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+					}
+				w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+				w00 = - w00*pT[0+ldt*0];
+				pC20[0+ps*0] += w00*1.0          + w01*0.0;
+				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+					}
+				pC20 += 1;
+				}
+			pC20 += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
+			w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
+			w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
+			w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
+			w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
+				w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
+				w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
+				w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
+				w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+			w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
+			w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
+			w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
+			w00 = - w00*pT[0+ldt*0];
+			w10 = - w10*pT[0+ldt*0];
+			w20 = - w20*pT[0+ldt*0];
+			w30 = - w30*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[1+ps*0] += w10*1.0          + w11*0.0;
+			pC20[2+ps*0] += w20*1.0          + w21*0.0;
+			pC20[3+ps*0] += w30*1.0          + w31*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
+			pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
+			pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
+				pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
+				pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
+				}
+			pC20 += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+			w00 = - w00*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				}
+			pC20 += 1;
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		if(ii<n)
+			{
+			kmax = n-ii;
+			jmax = m-ii-1;
+			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+			jmax0 = jmax<jmax0 ? jmax : jmax0;
+			jj = 0;
+			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+			pC10 = pC10a;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					w00 = pC10[0+ps*0];
+					for(kk=1; kk<kmax; kk++)
+						{
+						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+						}
+					w00 = - w00*dD[ii];
+					pC10[0+ps*0] += w00;
+					for(kk=1; kk<kmax; kk++)
+						{
+						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+						}
+					pC10 += 1;
+					}
+				pC10 += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				w00 = pC10[0+ps*0];
+				w10 = pC10[1+ps*0];
+				w20 = pC10[2+ps*0];
+				w30 = pC10[3+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
+					w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
+					w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
+					w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
+					}
+				w00 = - w00*dD[ii];
+				w10 = - w10*dD[ii];
+				w20 = - w20*dD[ii];
+				w30 = - w30*dD[ii];
+				pC10[0+ps*0] += w00;
+				pC10[1+ps*0] += w10;
+				pC10[2+ps*0] += w20;
+				pC10[3+ps*0] += w30;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00*pC00[0+ps*kk];
+					pC10[1+ps*kk] += w10*pC00[0+ps*kk];
+					pC10[2+ps*kk] += w20*pC00[0+ps*kk];
+					pC10[3+ps*kk] += w30*pC00[0+ps*kk];
+					}
+				pC10 += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				w00 = pC10[0+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+					}
+				w00 = - w00*dD[ii];
+				pC10[0+ps*0] += w00;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+					}
+				pC10 += 1;
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT) // build the 4x4 triangular factor T of a block of 4 Householder reflectors: rows of pD hold the reflector vectors, dD[i] the tau scalars; caller guarantees kmax>=4
+	{
+	const int ps = 4; // panel size of the packed matrix format
+	double v10, // vIJ accumulates the dot product of reflector vectors I and J (unit diagonal entries are implicit and not stored)
+	       v20, v21,
+		   v30, v31, v32;
+	// 0
+	// 1
+	v10 =  pD[0+ps*1]; // column 1 contributes via the implicit 1 on row 1
+	// 2
+	v10 += pD[1+ps*2]*pD[0+ps*2];
+	v20 =  pD[0+ps*2];
+	v21 =  pD[1+ps*2];
+	// 3
+	v10 += pD[1+ps*3]*pD[0+ps*3];
+	v20 += pD[2+ps*3]*pD[0+ps*3];
+	v21 += pD[2+ps*3]*pD[1+ps*3];
+	v30 =  pD[0+ps*3];
+	v31 =  pD[1+ps*3];
+	v32 =  pD[2+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++) // accumulate the remaining kmax-4 columns into the pairwise dot products
+		{
+		v10 += pD[1+ps*kk]*pD[0+ps*kk];
+		v20 += pD[2+ps*kk]*pD[0+ps*kk];
+		v30 += pD[3+ps*kk]*pD[0+ps*kk];
+		v21 += pD[2+ps*kk]*pD[1+ps*kk];
+		v31 += pD[3+ps*kk]*pD[1+ps*kk];
+		v32 += pD[3+ps*kk]*pD[2+ps*kk];
+		}
+	pT[0+ps*0] = - dD[0]; // diagonal of T: T(i,i) = -tau_i
+	pT[1+ps*1] = - dD[1];
+	pT[2+ps*2] = - dD[2];
+	pT[3+ps*3] = - dD[3];
+	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]); // strict upper triangle via the recurrence T(0:j-1,j) = -tau_j * T(0:j-1,0:j-1) * (V*v_j), filled column by column
+	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+	return;
+	}
+
+
+
+// assume n>=4
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT) // LQ-factorize the 4 rows of a 4 x n panel and build the 4x4 T factor on the fly; caller guarantees n>=4
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w0, w1, w2, w3;
+	const int ps = 4; // panel size of the packed matrix format
+	// zero tau matrix
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
+	// first column
+	beta = 0.0; // squared norm of row 0 to the right of the diagonal
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0) // tail of the row is zero: no reflector needed, tau=0
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta; // sign chosen opposite to alpha to avoid cancellation in alpha-beta
+	dD[0] = (beta-alpha) / beta;
+	pT[0+ps*0] = - dD[0]; // T(0,0) = -tau0
+	tmp = 1.0 / (alpha-beta); // scaling that normalizes the reflector to an implicit unit diagonal
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0]; // w = (rows 1..3 of D) * v0, accumulated while scaling row 0 in place
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1; // apply reflector 0 to rows 1..3: row_i += w_i * v0
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2]; // start accumulating the squared tail norm for the second reflector during the update
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2: // second reflector, acting on row 1
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	pT[1+ps*1] = - dD[1];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w0 = pD[0+ps*1]; // w0 accumulates v0.v1, needed for T(0,1)
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w0 += pD[0+ps*2] * pD[1+ps*2]; //
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[1+ps*3]; //
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]); // T(0,1) = -tau1 * T(0,0) * (v0.v1)
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3]; // squared tail norm for the third reflector
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3: // third reflector, acting on row 2
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	pT[2+ps*2] = - dD[2];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w0 = pD[0+ps*2]; // w0 and w1 accumulate v0.v2 and v1.v2 for the T factor
+	w1 = pD[1+ps*2];
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[2+ps*3];
+	w1 += pD[1+ps*3] * pD[2+ps*3];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[2+ps*ii];
+		w1 += pD[1+ps*ii] * pD[2+ps*ii];
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]); // column 2 of T from the same recurrence
+	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0; // squared tail norm for the fourth reflector (zero when n==4: the tail is empty)
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4: // fourth reflector, acting on row 3
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	pT[3+ps*3] = - dD[3];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	w0 =  pD[0+ps*3]; // dot products v0.v3, v1.v3, v2.v3 for the last column of T
+	w1 =  pD[1+ps*3];
+	w2 =  pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[3+ps*ii];
+		w1 += pD[1+ps*ii] * pD[3+ps*ii];
+		w2 += pD[2+ps*ii] * pD[3+ps*ii];
+		}
+	//
+	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+	return;
+	}
+
+
+
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD) // right-apply a block of 4 Householder reflectors to 4 rows of D: D <- D + ((D * V^T) * T) * V, with V unit-diagonal (stored in pV, factor T in pT)
+	{
+	const int ps = 4; // panel size of the packed matrix format
+	double pW[16]; // 4x4 workspace: W = D * V^T, then W <- W * T
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0]; // column 0 of V has only the implicit unit diagonal
+	pW[1+ps*0] = pD[1+ps*0];
+	pW[2+ps*0] = pD[2+ps*0];
+	pW[3+ps*0] = pD[3+ps*0];
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[1+ps*0] += pD[1+ps*1]*pV[0+ps*1];
+	pW[2+ps*0] += pD[2+ps*1]*pV[0+ps*1];
+	pW[3+ps*0] += pD[3+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	pW[1+ps*1] = pD[1+ps*1];
+	pW[2+ps*1] = pD[2+ps*1];
+	pW[3+ps*1] = pD[3+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[1+ps*0] += pD[1+ps*2]*pV[0+ps*2];
+	pW[2+ps*0] += pD[2+ps*2]*pV[0+ps*2];
+	pW[3+ps*0] += pD[3+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[1+ps*1] += pD[1+ps*2]*pV[1+ps*2];
+	pW[2+ps*1] += pD[2+ps*2]*pV[1+ps*2];
+	pW[3+ps*1] += pD[3+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	pW[1+ps*2] = pD[1+ps*2];
+	pW[2+ps*2] = pD[2+ps*2];
+	pW[3+ps*2] = pD[3+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[1+ps*0] += pD[1+ps*3]*pV[0+ps*3];
+	pW[2+ps*0] += pD[2+ps*3]*pV[0+ps*3];
+	pW[3+ps*0] += pD[3+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[1+ps*1] += pD[1+ps*3]*pV[1+ps*3];
+	pW[2+ps*1] += pD[2+ps*3]*pV[1+ps*3];
+	pW[3+ps*1] += pD[3+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[1+ps*2] += pD[1+ps*3]*pV[2+ps*3];
+	pW[2+ps*2] += pD[2+ps*3]*pV[2+ps*3];
+	pW[3+ps*2] += pD[3+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	pW[1+ps*3] = pD[1+ps*3];
+	pW[2+ps*3] = pD[2+ps*3];
+	pW[3+ps*3] = pD[3+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++) // accumulate the remaining kmax-4 columns into W = D * V^T
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[1+ps*0] += pD[1+ps*kk]*pV[0+ps*kk];
+		pW[2+ps*0] += pD[2+ps*kk]*pV[0+ps*kk];
+		pW[3+ps*0] += pD[3+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[1+ps*1] += pD[1+ps*kk]*pV[1+ps*kk];
+		pW[2+ps*1] += pD[2+ps*kk]*pV[1+ps*kk];
+		pW[3+ps*1] += pD[3+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[1+ps*2] += pD[1+ps*kk]*pV[2+ps*kk];
+		pW[2+ps*2] += pD[2+ps*kk]*pV[2+ps*kk];
+		pW[3+ps*2] += pD[3+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		pW[1+ps*3] += pD[1+ps*kk]*pV[3+ps*kk];
+		pW[2+ps*3] += pD[2+ps*kk]*pV[3+ps*kk];
+		pW[3+ps*3] += pD[3+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3]; // W <- W * T (upper triangular); columns updated last-to-first so earlier columns still hold D*V^T values
+	pW[1+ps*3] = pW[1+ps*0]*pT[0+ps*3] + pW[1+ps*1]*pT[1+ps*3] + pW[1+ps*2]*pT[2+ps*3] + pW[1+ps*3]*pT[3+ps*3];
+	pW[2+ps*3] = pW[2+ps*0]*pT[0+ps*3] + pW[2+ps*1]*pT[1+ps*3] + pW[2+ps*2]*pT[2+ps*3] + pW[2+ps*3]*pT[3+ps*3];
+	pW[3+ps*3] = pW[3+ps*0]*pT[0+ps*3] + pW[3+ps*1]*pT[1+ps*3] + pW[3+ps*2]*pT[2+ps*3] + pW[3+ps*3]*pT[3+ps*3];
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	pW[1+ps*2] = pW[1+ps*0]*pT[0+ps*2] + pW[1+ps*1]*pT[1+ps*2] + pW[1+ps*2]*pT[2+ps*2];
+	pW[2+ps*2] = pW[2+ps*0]*pT[0+ps*2] + pW[2+ps*1]*pT[1+ps*2] + pW[2+ps*2]*pT[2+ps*2];
+	pW[3+ps*2] = pW[3+ps*0]*pT[0+ps*2] + pW[3+ps*1]*pT[1+ps*2] + pW[3+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	pW[1+ps*1] = pW[1+ps*0]*pT[0+ps*1] + pW[1+ps*1]*pT[1+ps*1];
+	pW[2+ps*1] = pW[2+ps*0]*pT[0+ps*1] + pW[2+ps*1]*pT[1+ps*1];
+	pW[3+ps*1] = pW[3+ps*0]*pT[0+ps*1] + pW[3+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	pW[1+ps*0] = pW[1+ps*0]*pT[0+ps*0];
+	pW[2+ps*0] = pW[2+ps*0]*pT[0+ps*0];
+	pW[3+ps*0] = pW[3+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0]; // D <- D + W * V; the unit diagonal of V is again implicit in the first 4 columns
+	pD[1+ps*0] += pW[1+ps*0];
+	pD[2+ps*0] += pW[2+ps*0];
+	pD[3+ps*0] += pW[3+ps*0];
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	pD[1+ps*1] += pW[1+ps*0]*pV[0+ps*1] + pW[1+ps*1];
+	pD[2+ps*1] += pW[2+ps*0]*pV[0+ps*1] + pW[2+ps*1];
+	pD[3+ps*1] += pW[3+ps*0]*pV[0+ps*1] + pW[3+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	pD[1+ps*2] += pW[1+ps*0]*pV[0+ps*2] + pW[1+ps*1]*pV[1+ps*2] + pW[1+ps*2];
+	pD[2+ps*2] += pW[2+ps*0]*pV[0+ps*2] + pW[2+ps*1]*pV[1+ps*2] + pW[2+ps*2];
+	pD[3+ps*2] += pW[3+ps*0]*pV[0+ps*2] + pW[3+ps*1]*pV[1+ps*2] + pW[3+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	pD[1+ps*3] += pW[1+ps*0]*pV[0+ps*3] + pW[1+ps*1]*pV[1+ps*3] + pW[1+ps*2]*pV[2+ps*3] + pW[1+ps*3];
+	pD[2+ps*3] += pW[2+ps*0]*pV[0+ps*3] + pW[2+ps*1]*pV[1+ps*3] + pW[2+ps*2]*pV[2+ps*3] + pW[2+ps*3];
+	pD[3+ps*3] += pW[3+ps*0]*pV[0+ps*3] + pW[3+ps*1]*pV[1+ps*3] + pW[3+ps*2]*pV[2+ps*3] + pW[3+ps*3];
+	for(kk=4; kk<kmax; kk++) // trailing columns: full 4-term update
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		pD[1+ps*kk] += pW[1+ps*0]*pV[0+ps*kk] + pW[1+ps*1]*pV[1+ps*kk] + pW[1+ps*2]*pV[2+ps*kk] + pW[1+ps*3]*pV[3+ps*kk];
+		pD[2+ps*kk] += pW[2+ps*0]*pV[0+ps*kk] + pW[2+ps*1]*pV[1+ps*kk] + pW[2+ps*2]*pV[2+ps*kk] + pW[2+ps*3]*pV[3+ps*kk];
+		pD[3+ps*kk] += pW[3+ps*0]*pV[0+ps*kk] + pW[3+ps*1]*pV[1+ps*kk] + pW[3+ps*2]*pV[2+ps*kk] + pW[3+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
+
+
+
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD) // same block-reflector right-application as kernel_dlarfb4_r_4_lib4, but for a single row of D
+	{
+	const int ps = 4; // panel size of the packed matrix format
+	double pW[16]; // workspace; only row 0 (entries pW[0+ps*0..3]) is used here
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0]; // W = D * V^T, V unit-diagonal
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++) // accumulate the remaining columns
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3]; // W <- W * T, columns last-to-first
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0]; // D <- D + W * V
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	for(kk=4; kk<kmax; kk++)
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
diff --git a/kernel/c99/kernel_dgetrf_pivot_4_lib4.c b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..787322e
--- /dev/null
+++ b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,779 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering, starting from 0
+void didamax_lib4(int n, int offset, double *pA, int sda, int *p_idamax, double *p_amax) // find index (C numbering, relative to pA) and value of the max-abs element of a column of n entries in the 4-row-panel format; offset is the starting row within its panel, sda the panel stride
+	{
+
+	int idamax, ii;
+	double tmp, amax;
+		
+	p_idamax[0] = -1; // sentinel for the empty case
+	if(n<1)
+		return;
+
+	const int bs = 4; // panel height
+
+	int na = (bs - offset%bs)%bs; // entries before the next panel boundary
+	na = n<na ? n : na;
+
+	amax = -1.0; // fabs(x)>=0 always beats -1.0, so idamax is set on the first iteration whenever n>=1
+	ii = 0;
+	if(na>0)
+		{
+		for( ; ii<na; ii++) // head: finish the partially-used panel
+			{
+			tmp = fabs(pA[0]);
+			if(tmp>amax)
+				{
+				idamax = ii+0;
+				amax = tmp;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1); // jump to the next panel
+		}
+	for( ; ii<n-3; ii+=4) // main loop: one full panel of 4 entries per iteration
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		tmp = fabs(pA[1]);
+		if(tmp>amax)
+			{
+			idamax = ii+1;
+			amax = tmp;
+			}
+		tmp = fabs(pA[2]);
+		if(tmp>amax)
+			{
+			idamax = ii+2;
+			amax = tmp;
+			}
+		tmp = fabs(pA[3]);
+		if(tmp>amax)
+			{
+			idamax = ii+3;
+			amax = tmp;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<n; ii++) // tail: remaining 0-3 entries
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		pA += 1;
+		}
+	
+	p_amax[0] = amax;
+	p_idamax[0] = idamax;
+
+	return;
+
+	}
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv) // right-looking LU with partial (row) pivoting of an m x 4 panel in 4-row-panel format; writes reciprocal pivots to inv_diag_A and pivot row indices (C numbering) to ipiv
+	{
+
+	const int bs = 4; // panel height
+
+	// assume m>=4
+	int ma = m-4; // rows below the leading 4x4 block
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0); // pivot search over the whole column
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // swap row 0 with the pivot row (index split into panel and in-panel offset)
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		pA[1+bs*0] *= tmp0; // scale the sub-diagonal of column 0 by the reciprocal pivot
+		pA[2+bs*0] *= tmp0;
+		pA[3+bs*0] *= tmp0;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4) // remaining rows, one 4-row panel per iteration
+			{
+			pB[0+bs*0] *= tmp0;
+			pB[1+bs*0] *= tmp0;
+			pB[2+bs*0] *= tmp0;
+			pB[3+bs*0] *= tmp0;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0; // singular pivot: record zero so callers can detect it
+		}
+
+	// second column
+	u_01  = pA[0+bs*1]; // correct column 1 with the already-factorized column 0
+	tmp1  = pA[1+bs*1];
+	tmp2  = pA[2+bs*1];
+	tmp3  = pA[3+bs*1];
+	tmp1 -= pA[1+bs*0] * u_01;
+	tmp2 -= pA[2+bs*0] * u_01;
+	tmp3 -= pA[3+bs*0] * u_01;
+	pA[1+bs*1] = tmp1;
+	pA[2+bs*1] = tmp2;
+	pA[3+bs*1] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*1];
+		tmp1  = pB[1+bs*1];
+		tmp2  = pB[2+bs*1];
+		tmp3  = pB[3+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		tmp1 -= pB[1+bs*0] * u_01;
+		tmp2 -= pB[2+bs*0] * u_01;
+		tmp3 -= pB[3+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB[1+bs*1] = tmp1;
+		pB[2+bs*1] = tmp2;
+		pB[3+bs*1] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1); // pivot search below the diagonal of column 1
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		pA[2+bs*1] *= tmp1;
+		pA[3+bs*1] *= tmp1;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB[1+bs*1] *= tmp1;
+			pB[2+bs*1] *= tmp1;
+			pB[3+bs*1] *= tmp1;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	// third column
+	u_02  = pA[0+bs*2]; // correct column 2 with columns 0 and 1
+	u_12  = pA[1+bs*2];
+	u_12 -= pA[1+bs*0] * u_02;
+	pA[1+bs*2] = u_12;
+	tmp2  = pA[2+bs*2];
+	tmp3  = pA[3+bs*2];
+	tmp2 -= pA[2+bs*0] * u_02;
+	tmp3 -= pA[3+bs*0] * u_02;
+	tmp2 -= pA[2+bs*1] * u_12;
+	tmp3 -= pA[3+bs*1] * u_12;
+	pA[2+bs*2] = tmp2;
+	pA[3+bs*2] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp1  = pB[1+bs*2];
+		tmp2  = pB[2+bs*2];
+		tmp3  = pB[3+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp1 -= pB[1+bs*0] * u_02;
+		tmp2 -= pB[2+bs*0] * u_02;
+		tmp3 -= pB[3+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		tmp1 -= pB[1+bs*1] * u_12;
+		tmp2 -= pB[2+bs*1] * u_12;
+		tmp3 -= pB[3+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB[1+bs*2] = tmp1;
+		pB[2+bs*2] = tmp2;
+		pB[3+bs*2] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+	ipiv[2] = idamax+2;
+	if(tmp2!=0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		tmp2 = 1.0 / pA[2+bs*2];
+		inv_diag_A[2] = tmp2;
+		pA[3+bs*2] *= tmp2;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB[1+bs*2] *= tmp2;
+			pB[2+bs*2] *= tmp2;
+			pB[3+bs*2] *= tmp2;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[2] = 0.0;
+		}
+
+	// fourth column
+	u_03  = pA[0+bs*3]; // correct column 3 with columns 0, 1 and 2
+	u_13  = pA[1+bs*3];
+	u_13 -= pA[1+bs*0] * u_03;
+	pA[1+bs*3] = u_13;
+	u_23  = pA[2+bs*3];
+	u_23 -= pA[2+bs*0] * u_03;
+	u_23 -= pA[2+bs*1] * u_13;
+	pA[2+bs*3] = u_23;
+	tmp3  = pA[3+bs*3];
+	tmp3 -= pA[3+bs*0] * u_03;
+	tmp3 -= pA[3+bs*1] * u_13;
+	tmp3 -= pA[3+bs*2] * u_23;
+	pA[3+bs*3] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp1  = pB[1+bs*3];
+		tmp2  = pB[2+bs*3];
+		tmp3  = pB[3+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp1 -= pB[1+bs*0] * u_03;
+		tmp2 -= pB[2+bs*0] * u_03;
+		tmp3 -= pB[3+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp1 -= pB[1+bs*1] * u_13;
+		tmp2 -= pB[2+bs*1] * u_13;
+		tmp3 -= pB[3+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		tmp1 -= pB[1+bs*2] * u_23;
+		tmp2 -= pB[2+bs*2] * u_23;
+		tmp3 -= pB[3+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB[1+bs*3] = tmp1;
+		pB[2+bs*3] = tmp2;
+		pB[3+bs*3] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+	ipiv[3] = idamax+3;
+	if(tmp3!=0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		tmp3 = 1.0 / pA[3+bs*3];
+		inv_diag_A[3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB[1+bs*3] *= tmp3;
+			pB[2+bs*3] *= tmp3;
+			pB[3+bs*3] *= tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[3] = 0.0;
+		}
+	
+	return;
+
+	}
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv) // variable-size variant of kernel_dgetrf_pivot_4_lib4: processes m>0 rows and 0<n<=4 columns, guarding every access by m and n
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4; // panel height
+
+	// assume m>=4
+	int ma = m-4; // negative when m<4, but the ma-based loops below only run inside the m>=4 branches
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+
+	// find pivot & scale
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // swap row 0 with the pivot row
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		if(m>=4)
+			{
+			pA[1+bs*0] *= tmp0;
+			pA[2+bs*0] *= tmp0;
+			pA[3+bs*0] *= tmp0;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB[1+bs*0] *= tmp0;
+				pB[2+bs*0] *= tmp0;
+				pB[3+bs*0] *= tmp0;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB += 1;
+				}
+			}
+		else // m = {1,2,3}
+			{
+			if(m>1)
+				{
+				pA[1+bs*0] *= tmp0;
+				if(m>2)
+					pA[2+bs*0] *= tmp0;
+				}
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0; // singular pivot: record zero
+		}
+	
+	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+		return;
+
+	// second column
+
+	// correct
+	if(m>=4)
+		{
+		u_01  = pA[0+bs*1]; // update column 1 with the factorized column 0
+		tmp1  = pA[1+bs*1];
+		tmp2  = pA[2+bs*1];
+		tmp3  = pA[3+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		tmp2 -= pA[2+bs*0] * u_01;
+		tmp3 -= pA[3+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		pA[2+bs*1] = tmp2;
+		pA[3+bs*1] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*1];
+			tmp1  = pB[1+bs*1];
+			tmp2  = pB[2+bs*1];
+			tmp3  = pB[3+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			tmp1 -= pB[1+bs*0] * u_01;
+			tmp2 -= pB[2+bs*0] * u_01;
+			tmp3 -= pB[3+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB[1+bs*1] = tmp1;
+			pB[2+bs*1] = tmp2;
+			pB[3+bs*1] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*1];
+			tmp2 -= pA[2+bs*0] * u_01;
+			pA[2+bs*1] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		if(m>=4)
+			{
+			pA[2+bs*1] *= tmp1;
+			pA[3+bs*1] *= tmp1;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB[1+bs*1] *= tmp1;
+				pB[2+bs*1] *= tmp1;
+				pB[3+bs*1] *= tmp1;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB += 1;
+				}
+			}
+		else // m = {2,3}
+			{
+			if(m>2)
+				pA[2+bs*1] *= tmp1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	if(n==2)
+		return;
+
+	// third column
+
+	// correct
+	if(m>=4)
+		{
+		u_02  = pA[0+bs*2]; // update column 2 with columns 0 and 1
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		tmp2  = pA[2+bs*2];
+		tmp3  = pA[3+bs*2];
+		tmp2 -= pA[2+bs*0] * u_02;
+		tmp3 -= pA[3+bs*0] * u_02;
+		tmp2 -= pA[2+bs*1] * u_12;
+		tmp3 -= pA[3+bs*1] * u_12;
+		pA[2+bs*2] = tmp2;
+		pA[3+bs*2] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp1  = pB[1+bs*2];
+			tmp2  = pB[2+bs*2];
+			tmp3  = pB[3+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp1 -= pB[1+bs*0] * u_02;
+			tmp2 -= pB[2+bs*0] * u_02;
+			tmp3 -= pB[3+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			tmp1 -= pB[1+bs*1] * u_12;
+			tmp2 -= pB[2+bs*1] * u_12;
+			tmp3 -= pB[3+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB[1+bs*2] = tmp1;
+			pB[2+bs*2] = tmp2;
+			pB[3+bs*2] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*2];
+			tmp2 -= pA[2+bs*0] * u_02;
+			tmp2 -= pA[2+bs*1] * u_12;
+			pA[2+bs*2] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	if(m>2) // NOTE(review): for m==2 with n>=3 neither ipiv[2] nor inv_diag_A[2] is written here — callers presumably ignore entries beyond row m; confirm
+		{
+		didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+		ipiv[2] = idamax+2;
+		if(tmp2!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			tmp2 = 1.0 / pA[2+bs*2];
+			inv_diag_A[2] = tmp2;
+			if(m>=4)
+				{
+				pA[3+bs*2] *= tmp2;
+				pB = pA + bs*sda;
+				for(k=0; k<ma-3; k+=4)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB[1+bs*2] *= tmp2;
+					pB[2+bs*2] *= tmp2;
+					pB[3+bs*2] *= tmp2;
+					pB += bs*sda;
+					}
+				for( ; k<ma; k++)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB += 1;
+					}
+				}
+			}
+		else
+			{
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n<4)
+		return;
+
+	// fourth column
+
+	// correct
+	if(m>=4)
+		{
+		u_03  = pA[0+bs*3]; // update column 3 with columns 0, 1 and 2
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		u_23  = pA[2+bs*3];
+		u_23 -= pA[2+bs*0] * u_03;
+		u_23 -= pA[2+bs*1] * u_13;
+		pA[2+bs*3] = u_23;
+		tmp3  = pA[3+bs*3];
+		tmp3 -= pA[3+bs*0] * u_03;
+		tmp3 -= pA[3+bs*1] * u_13;
+		tmp3 -= pA[3+bs*2] * u_23;
+		pA[3+bs*3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp1  = pB[1+bs*3];
+			tmp2  = pB[2+bs*3];
+			tmp3  = pB[3+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp1 -= pB[1+bs*0] * u_03;
+			tmp2 -= pB[2+bs*0] * u_03;
+			tmp3 -= pB[3+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp1 -= pB[1+bs*1] * u_13;
+			tmp2 -= pB[2+bs*1] * u_13;
+			tmp3 -= pB[3+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			tmp1 -= pB[1+bs*2] * u_23;
+			tmp2 -= pB[2+bs*2] * u_23;
+			tmp3 -= pB[3+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB[1+bs*3] = tmp1;
+			pB[2+bs*3] = tmp2;
+			pB[3+bs*3] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		if(m>2)
+			{
+			u_23  = pA[2+bs*3];
+			u_23 -= pA[2+bs*0] * u_03;
+			u_23 -= pA[2+bs*1] * u_13;
+			pA[2+bs*3] = u_23;
+			}
+		}
+
+	if(m>3)
+		{
+		// find pivot & scale
+		didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+		ipiv[3] = idamax+3;
+		if(tmp3!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			tmp3 = 1.0 / pA[3+bs*3];
+			inv_diag_A[3] = tmp3;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB[1+bs*3] *= tmp3;
+				pB[2+bs*3] *= tmp3;
+				pB[3+bs*3] *= tmp3;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB += 1;
+				}
+			}
+		else
+			{
+			inv_diag_A[3] = 0.0;
+			}
+		}
+	
+	return;
+
+	}
+
+
+	
+
+
diff --git a/kernel/c99/kernel_dsymv_4_lib4.c b/kernel/c99/kernel_dsymv_4_lib4.c
new file mode 100644
index 0000000..bed4300
--- /dev/null
+++ b/kernel/c99/kernel_dsymv_4_lib4.c
@@ -0,0 +1,1024 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha_n[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha_n[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha_n[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha_n[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	// store t
+	z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+	if(km>1)
+		{
+		z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+		if(km>2)
+			{
+			z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+			if(km>3)
+				{
+				z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
+	{
+
+	kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	double *x_t = x_n;
+	double *z_t = z_n;
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	if(offA==0)
+		{
+		if(kmax<4)
+			{
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+			if(kmax==1)
+				goto store_t;
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+			if(kmax==2)
+				goto store_t;
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+			goto store_t;
+			}
+		else
+			{
+
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+
+			// 3
+
+			y_n_0 = z_n[3]; 
+			x_t_0 = x_t[3];
+
+			a_00 = A[3+bs*0];
+			a_01 = A[3+bs*1];
+			a_02 = A[3+bs*2];
+			a_03 = A[3+bs*3];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_n_0 += a_02 * x_n_2;
+			y_t_2 += a_02 * x_t_0;
+			y_t_3 += a_03 * x_t_0;
+
+			z_n[3] = y_n_0;
+
+			k += 4;
+			A += sda*bs;
+			z_n += 4;
+			x_t += 4;
+
+			}
+		}
+	else if(offA==1)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==6)
+			goto store_t;
+
+		// 6
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==7)
+			goto store_t;
+
+		k += 7;
+
+		}
+	else if(offA==2)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==6)
+			goto store_t;
+
+		k += 6;
+
+		}
+	else // if(offA==3)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==5)
+			goto store_t;
+
+		k += 5;
+
+		}
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	store_t:
+	z_t[0] += alpha[0]*y_t_0;
+	if(km>1)
+		{
+		z_t[1] += alpha[0]*y_t_1;
+		if(km>2)
+			{
+			z_t[2] += alpha[0]*y_t_2;
+			if(km>3)
+				{
+				z_t[3] += alpha[0]*y_t_3;
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
+	{
+
+	kernel_dsymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+
diff --git a/kernel/c99/kernel_sgecp_lib4.c b/kernel/c99/kernel_sgecp_lib4.c
new file mode 100644
index 0000000..de5b704
--- /dev/null
+++ b/kernel/c99/kernel_sgecp_lib4.c
@@ -0,0 +1,1148 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+		A[2+bs*0] *= alpha;
+		A[3+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+		A[2+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+		B[3+bs*0] = A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[1+bs*0];
+		B[1+bs*0] = A0[2+bs*0];
+		B[2+bs*0] = A0[3+bs*0];
+		B[3+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+		B[3+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+		B[3+bs*0] = A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+		B[3+bs*0] = A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+	B[2+bs*0] = A[2+bs*0];
+	B[3+bs*0] = A[3+bs*0];
+
+	B[2+bs*1] = A[2+bs*1];
+	B[3+bs*1] = A[3+bs*1];
+
+	B[3+bs*2] = A[3+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[1+bs*0];
+		B[1+bs*0] = A0[2+bs*0];
+		B[2+bs*0] = A0[3+bs*0];
+		B[3+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+0*bs] = A0[2+0*bs];
+	B[2+0*bs] = A0[3+0*bs];
+	B[3+0*bs] = A1[0+0*bs];
+
+	B[2+1*bs] = A0[3+1*bs];
+	B[3+1*bs] = A1[0+1*bs];
+
+	B[3+2*bs] = A1[0+2*bs];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+		B[3+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A0[3+bs*0];
+	B[2+bs*0] = A1[0+bs*0];
+	B[3+bs*0] = A1[1+bs*0];
+
+	B[2+bs*1] = A1[0+bs*1];
+	B[3+bs*1] = A1[1+bs*1];
+
+	B[3+bs*2] = A1[1+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+		B[3+bs*0] = A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+	B[2+bs*0] = A1[1+bs*0];
+	B[3+bs*0] = A1[2+bs*0];
+
+	B[2+bs*1] = A1[1+bs*1];
+	B[3+bs*1] = A1[2+bs*1];
+
+	B[3+bs*2] = A1[2+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+	B[2+bs*0] = A[2+bs*0];
+
+	B[2+bs*1] = A[2+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A0[3+bs*0];
+	B[2+bs*0] = A1[0+bs*0];
+
+	B[2+bs*1] = A1[0+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+	B[2+bs*0] = A1[1+bs*0];
+
+	B[2+bs*1] = A1[1+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 2-wide + end 1x1 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 1x1 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 2-wide + end 1x1 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 1x1 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and C are lower triangular
+	// kmax+1 1-wide
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+// B += alpha*A on a 4-wide strip, kmax columns, panel-major (lib4) layout;
+// alpha is passed by pointer (alphap[0]).
+void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// scaled add of one 4-element column
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+// B += alpha*A on a 4-wide strip; A starts 1 row into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// rows 1-3 of panel A0, row 0 of panel A1
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+// B += alpha*A on a 4-wide strip; A starts 2 rows into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// rows 2-3 of panel A0, rows 0-1 of panel A1
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+// B += alpha*A on a 4-wide strip; A starts 3 rows into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// row 3 of panel A0, rows 0-2 of panel A1
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+// B += alpha*A on a 3-wide strip, kmax columns, lib4 layout, no row offset.
+void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// scaled add of one 3-element column
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+// B += alpha*A on a 3-wide strip; A starts 2 rows into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// rows 2-3 of panel A0, row 0 of panel A1
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+// B += alpha*A on a 3-wide strip; A starts 3 rows into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// row 3 of panel A0, rows 0-1 of panel A1
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+// B += alpha*A on a 2-wide strip, kmax columns, lib4 layout, no row offset.
+void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// scaled add of one 2-element column
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+// B += alpha*A on a 2-wide strip; A starts 3 rows into its panel, so each
+// column read straddles into the next panel A1 (sda = panel stride of A).
+void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;	// next 4-row panel of A
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		// row 3 of panel A0, row 0 of panel A1
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned 64-bit boundaries
+// B += alpha*A on a 1-wide strip: one element per column, kmax columns.
+void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgemm_4x4_lib4.c b/kernel/c99/kernel_sgemm_4x4_lib4.c
new file mode 100644
index 0000000..243d559
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_4x4_lib4.c
@@ -0,0 +1,6094 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha*A*B^T on a 4x4 tile in panel-major (lib4) storage,
+// "generalized" edition: C and D may start at any row offset
+// (offsetC/offsetD) within a 4-row panel (sdc/sdd are the panel strides),
+// and the store is masked to rows [m0,m1) and shifted/truncated to columns
+// [n0,n1). alpha and beta are passed by pointer; kmax is the inner-product
+// length (A and B are both read column-by-column, 4 floats per k).
+void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	float
+		*C1, *D1;	// second panel touched when the tile straddles a panel boundary
+	
+	int k;
+
+	// main accumulation loop, manually unrolled by 4 along k
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the kmax%4 leftover iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// blend with C at its row offset inside the panel: c = beta*C + alpha*c;
+	// for offsetC>0 the 4 rows straddle panels C0 and C1
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	// drop the first n0 columns of the result (shift the register tile left)
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;	// number of columns left to store
+
+	// store at row offset offsetD, keeping only rows [m0,m1) and kn columns;
+	// the bitwise & on 0/1 comparison results is a branchless logical AND
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha*A*B^T on a 4x4 tile, panel-major (lib4) storage,
+// "variable size" edition: the full 4x4 result is computed but only the
+// top-left km rows x kn columns are stored to D (km,kn in 1..4).
+// alpha and beta are passed by pointer; kmax is the inner-product length.
+void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// main accumulation loop, manually unrolled by 4 along k
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the kmax%4 leftover iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// blend: c = beta*C + alpha*c (full 4x4 tile of C is read)
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	// store only the first km rows and kn columns of the tile
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
+// Full-tile nt gemm: thin wrapper over the vs kernel with km=kn=4 (no masking).
+void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta*C + alpha*A*B on a 4x4 tile, panel-major (lib4) storage, B not
+// transposed: one k-row of B is read across the panel (elements at offsets
+// 0,4,8,12 within the panel), so B advances by 1 per k iteration and by
+// 4*sdb (sdb = panel stride of B) per 4 iterations. Only the top-left
+// km rows x kn columns are stored to D.
+void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;	// panel height of the lib4 format
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// main accumulation loop, manually unrolled by 4 along k
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	// clean-up loop for the kmax%4 leftover iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	// blend: c = beta*C + alpha*c (full 4x4 tile of C is read)
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	// store only the first km rows and kn columns of the tile
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Full-tile nn gemm: thin wrapper over the vs kernel with km=kn=4 (no masking).
+void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+	{
+	kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+//	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+//	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+//	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+//	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+//	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+//	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size ssyrk (lower) kernel: delegates to the variable-size variant
+// with the full 4x4 tile stored (km = kn = 4).
+void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 strmm kernel: computes the 4x4 tile
+//   D = beta[0]*C + alpha[0] * A * B^T
+// where, per the kernel name (nt_ru), B holds a right/upper triangular
+// factor: the three peeled iterations below only read the B entries that
+// lie inside the triangle (k=0 reads b_0 only, k=1 adds b_1, k=2 adds b_2).
+// km/kn give the number of rows/columns of the tile actually stored to D
+// ("vs" = variable size).  A, B, C, D are column-major 4-wide panels (bs=4).
+void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	k = 0;
+
+	// triangular head: peel the first three iterations, touching only the
+	// part of B inside the triangle
+	// k = 0
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 1
+	// NOTE(review): this guard (and the k = 2 one below) re-tests kmax>0
+	// even though k has already advanced, so for kmax==1 or kmax==2 extra
+	// iterations run past the logical end of A and B — presumably callers
+	// guarantee kmax>=3 here; confirm against upstream.
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 2
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// main loop: unrolled by 4, full 4x4 rank-1 updates
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining 0..3 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// blend: scale the accumulator by alpha and add beta*C
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	// store, masking rows beyond km and columns beyond kn
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size strmm (nt, right/upper) kernel: delegates to the
+// variable-size variant with the full 4x4 tile stored (km = kn = 4).
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Generalized 4x4 strmm kernel: accumulates D = alpha[0] * A * B where,
+// per the kernel name (nn_rl), the right factor B is lower triangular.
+// B is read row-wise across 4-wide column-major panels with panel stride
+// sdb, starting at row offsetB inside its first panel; the four offsetB
+// cases below peel the triangular head while realigning B to a panel
+// boundary.  offsetD shifts the stored tile downward across the D0/D1
+// panel boundary, and only elements with row index in [m0,m1) and column
+// index in [n0,n1) of the 4x4 tile are written ("gen" = generalized).
+// There is no beta*C term in this kernel.
+void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	float *D1;
+	
+	int k;
+
+	B += offsetB;
+
+	k = 0;
+
+	// triangular head: peel iterations until B reaches a panel boundary;
+	// each peeled step reads one more column of the widening triangle
+	if(offsetB==0)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==1)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==2)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 5
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else // if(offetB==3)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+
+	// main loop: unrolled by 4; B advances one full panel (4*sdb) per pass
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	// clean-up loop for the remaining 0..3 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	store:
+	
+	// scale the accumulator by alpha (no beta*C term in this kernel)
+	c_00 = alpha[0]*c_00;
+	c_10 = alpha[0]*c_10;
+	c_20 = alpha[0]*c_20;
+	c_30 = alpha[0]*c_30;
+
+	c_01 = alpha[0]*c_01;
+	c_11 = alpha[0]*c_11;
+	c_21 = alpha[0]*c_21;
+	c_31 = alpha[0]*c_31;
+
+	c_02 = alpha[0]*c_02;
+	c_12 = alpha[0]*c_12;
+	c_22 = alpha[0]*c_22;
+	c_32 = alpha[0]*c_32;
+
+	c_03 = alpha[0]*c_03;
+	c_13 = alpha[0]*c_13;
+	c_23 = alpha[0]*c_23;
+	c_33 = alpha[0]*c_33;
+
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	// masked store: rows with index in [m0,m1) only; when offsetD>0 the
+	// tile straddles the panel boundary and D1 points to the next panel
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size strmm (nn, right/lower) kernel: delegates to the generalized
+// variant with offsetD=0, the full 4x4 tile ([0,4)x[0,4)) and sdd=0 (the
+// D1 panel pointer is never used when offsetD==0).
+void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
+	{
+	kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size 4x4 Cholesky factorization kernel: downdates the tile as
+// C - A*B^T (nt: B transposed, per the kernel name) and factorizes the
+// result into its lower triangular Cholesky factor D.  The reciprocals of
+// the diagonal entries are written to inv_diag_D; a non-positive pivot is
+// clamped to 0.0 (with inverse 0.0) instead of signalling an error.
+// km/kn mask the rows/columns stored to D; only the lower triangle is
+// computed (upper accumulators are commented out throughout).
+void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	// downdate loop: subtract A*B^T from the accumulator, unrolled by 4
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	// clean-up loop for the remaining 0..3 iterations
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// add the C tile (lower triangle only)
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+//	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+//	c_02 = C[0+bs*2] + c_02;
+//	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+//	c_03 = C[0+bs*3] + c_03;
+//	c_13 = C[1+bs*3] + c_13;
+//	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	// Cholesky factorization of the 4x4 tile, column by column; a
+	// non-positive pivot is clamped to zero (and its inverse set to 0.0)
+	// rather than signalling an error.
+	// NOTE(review): sqrt() computes in double and narrows back to float;
+	// sqrtf() would avoid the promotion — confirm this is intentional.
+	if(c_00>0)
+		{
+		c_00 = sqrt(c_00);
+		tmp = 1.0/c_00;
+		}
+	else
+		{
+		c_00 = 0.0;
+		tmp = 0.0;
+		}
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+	
+	// column 1
+	c_11 -= c_10 * c_10;
+	c_21 -= c_20 * c_10;
+	c_31 -= c_30 * c_10;
+	if(c_11>0)
+		{
+		c_11 = sqrt(c_11);
+		tmp = 1.0/c_11;
+		}
+	else
+		{
+		c_11 = 0.0;
+		tmp = 0.0;
+		}
+	c_21 *= tmp;
+	c_31 *= tmp;
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+	
+	// column 2
+	c_22 -= c_20 * c_20;
+	c_32 -= c_30 * c_20;
+	c_22 -= c_21 * c_21;
+	c_32 -= c_31 * c_21;
+	if(c_22>0)
+		{
+		c_22 = sqrt(c_22);
+		tmp = 1.0/c_22;
+		}
+	else
+		{
+		c_22 = 0.0;
+		tmp = 0.0;
+		}
+	c_32 *= tmp;
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+	
+	// column 3
+	c_33 -= c_30 * c_30;
+	c_33 -= c_31 * c_31;
+	c_33 -= c_32 * c_32;
+	if(c_33>0)
+		{
+		c_33 = sqrt(c_33);
+		tmp = 1.0/c_33;
+		}
+	else
+		{
+		c_33 = 0.0;
+		tmp = 0.0;
+		}
+	inv_diag_D[3] = tmp;
+
+
+	store:
+
+	// store the lower triangle, masking rows beyond km and columns beyond kn
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size spotrf (lower) kernel: delegates to the variable-size variant
+// with the full 4x4 tile stored (km = kn = 4).
+void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
+	{
+	kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused syrk + potrf (variable-size): first calls the syrk kernel with
// alpha=beta=1 accumulating into D, then runs the Cholesky kernel on D in
// place (D is passed as both C and D of the potrf call).
// kp/Ap/Bp drive the syrk update; km_/Am/Bm drive the potrf correction;
// km/kn mask the rows/columns actually stored.
void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
	{
	float alpha = 1.0;
	float beta = 1.0;
	kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
	kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) fused syrk + potrf: syrk with alpha=beta=1 into D,
// then Cholesky factorization of D in place.
void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
	{
	float alpha = 1.0;
	float beta = 1.0;
	kernel_ssyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
	kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size strsm kernel, nt / right-lower / inverted-diagonal variant:
// accumulates cc = C - A*B^T over kmax rank-1 updates, then solves each of
// the first kn columns against the lower-triangular factor E, scaling with
// the precomputed reciprocals in inv_diag_E, and stores the km x kn corner
// of the result into D. Panels are column-major with leading dimension bs=4.
// Floating-point update order matches the unrolled original exactly: each
// accumulator receives one update per k (in k order), and each column is
// corrected with the previously solved columns in ascending order before
// being scaled.
void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	float cc[4][4] = {{0.0f}}; // cc[row][col] accumulator

	int ii, jj, ll, k;

	// gemm part: cc -= A * B^T, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[ii][jj] -= A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// add C
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[ii][jj] = C[ii+bs*jj] + cc[ii][jj];
			}
		}

	int jmax = kn<4 ? kn : 4; // columns actually solved and stored
	int imax = km<4 ? km : 4; // rows actually stored

	// triangular solve: column 0 only needs the diagonal scaling
	for(ii=0; ii<4; ii++)
		cc[ii][0] *= inv_diag_E[0];

	// each further column: subtract already-solved columns, then scale
	for(jj=1; jj<jmax; jj++)
		{
		for(ll=0; ll<jj; ll++)
			{
			float e = E[jj+bs*ll];
			for(ii=0; ii<4; ii++)
				cc[ii][jj] -= cc[ii][ll] * e;
			}
		float d = inv_diag_E[jj];
		for(ii=0; ii<4; ii++)
			cc[ii][jj] *= d;
		}

	// store the top-left km x kn corner of the result
	for(jj=0; jj<jmax; jj++)
		{
		for(ii=0; ii<imax; ii++)
			D[ii+bs*jj] = cc[ii][jj];
		}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) entry point: forwards to the variable-size kernel with
// km=kn=4 so the full block is solved and stored.
void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused gemm + trsm (variable-size): calls the gemm kernel with alpha=beta=1
// accumulating into D, then runs the rl_inv triangular solve on D in place
// (D is passed as both C and D of the trsm call).
void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{
	float alpha = 1.0;
	float beta  = 1.0;
	kernel_sgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) fused gemm + trsm: gemm with alpha=beta=1 into D, then
// the rl_inv triangular solve of D in place.
void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
	{
	float alpha = 1.0;
	float beta  = 1.0;
	kernel_sgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
	kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size strsm kernel, nt / right-lower / unit-diagonal variant:
// accumulates cc = C - A*B^T, then solves each of the first kn columns
// against the unit-lower-triangular factor E (no diagonal scaling, hence
// "one"), and stores the km x kn corner into D. Panels are column-major
// with leading dimension bs=4. Per-accumulator floating-point update order
// matches the unrolled original exactly.
void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	float cc[4][4] = {{0.0f}}; // cc[row][col] accumulator

	int ii, jj, ll, k;

	// gemm part: cc -= A * B^T, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[ii][jj] -= A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// add C
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[ii][jj] = C[ii+bs*jj] + cc[ii][jj];
			}
		}

	int jmax = kn<4 ? kn : 4; // columns actually solved and stored
	int imax = km<4 ? km : 4; // rows actually stored

	// unit-diagonal solve: column 0 passes through unchanged; every later
	// column subtracts the already-solved columns in ascending order
	for(jj=1; jj<jmax; jj++)
		{
		for(ll=0; ll<jj; ll++)
			{
			float e = E[jj+bs*ll];
			for(ii=0; ii<4; ii++)
				cc[ii][jj] -= cc[ii][ll] * e;
			}
		}

	// store the top-left km x kn corner of the result
	for(jj=0; jj<jmax; jj++)
		{
		for(ii=0; ii<imax; ii++)
			D[ii+bs*jj] = cc[ii][jj];
		}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) entry point: forwards to the variable-size kernel with
// km=kn=4 so the full block is solved and stored.
void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
	{
	kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size strsm kernel, nt / right-UPPER / inverted-diagonal variant:
// accumulates cc = C - A*B^T, then performs a backward solve against the
// upper-triangular factor E — highest enabled column first — scaling with
// the precomputed reciprocals in inv_diag_E, and stores the km x kn corner
// into D. Panels are column-major with leading dimension bs=4.
// Fix over the original: the original declared a `store:` label that no
// goto ever reached (the masking here is done with if(kn>j) blocks), which
// triggers -Wunused-label; the dead label is removed.
// Floating-point update order matches the unrolled original exactly:
// column j (j = 3..1, when kn>j) is scaled, then its contribution is
// subtracted from columns j-1 down to 0; finally column 0 is scaled.
void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	float cc[4][4] = {{0.0f}}; // cc[row][col] accumulator

	int ii, jj, ll, k;

	// gemm part: cc -= A * B^T, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				cc[ii][jj] -= A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// add C
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			cc[ii][jj] = C[ii+bs*jj] + cc[ii][jj];
			}
		}

	// backward solve: process columns 3..1, each guarded by kn
	for(jj=3; jj>0; jj--)
		{
		if(kn>jj)
			{
			float d = inv_diag_E[jj];
			for(ii=0; ii<4; ii++)
				cc[ii][jj] *= d;
			// propagate the solved column into all earlier columns,
			// from column jj-1 down to column 0 (original order)
			for(ll=jj-1; ll>=0; ll--)
				{
				float e = E[ll+bs*jj];
				for(ii=0; ii<4; ii++)
					cc[ii][ll] -= cc[ii][jj] * e;
				}
			}
		}

	// column 0 only needs the diagonal scaling
	{
	float d0 = inv_diag_E[0];
	for(ii=0; ii<4; ii++)
		cc[ii][0] *= d0;
	}

	// store the top-left km x kn corner of the result
	{
	int jmax = kn<4 ? kn : 4;
	int imax = km<4 ? km : 4;
	for(jj=0; jj<jmax; jj++)
		{
		for(ii=0; ii<imax; ii++)
			D[ii+bs*jj] = cc[ii][jj];
		}
	}

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) entry point: forwards to the variable-size kernel with
// km=kn=4 so the full block is solved and stored.
void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size LU kernel (nn): accumulates cc = C - A*B (B panel-major
// with panel stride sdb), then performs an unblocked, left-looking LU
// factorization WITHOUT pivoting on the first kn columns, writing the
// pivot reciprocals into inv_diag_D, and stores the km x kn corner into D.
// Panels are column-major with leading dimension bs=4.
// Floating-point order matches the unrolled original exactly: each
// accumulator gets one update per k in k order; each column is updated by
// the previously factorized columns in ascending order, then its
// subdiagonal is scaled by 1/pivot (the 1.0/x double divide of the
// original is kept). The original's `goto add` on kmax<=0 is subsumed by
// the loops being naturally empty in that case.
void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
	{

	const int bs = 4;

	float cc[4][4] = {{0.0f}}; // cc[row][col] accumulator

	int ii, jj, ll, k;

	// gemm part: unrolled-by-4 panels, then the remainder one k at a time
	for(k=0; k<kmax-3; k+=4)
		{
		for(ll=0; ll<4; ll++)
			{
			for(jj=0; jj<4; jj++)
				{
				for(ii=0; ii<4; ii++)
					cc[ii][jj] -= A[ii+bs*ll] * B[ll+bs*jj];
				}
			}
		A += 16;
		B += 4*sdb;
		}
	for(; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				cc[ii][jj] -= A[ii+bs*0] * B[0+bs*jj];
			}
		A += 4;
		B += 1;
		}

	// add C
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			cc[ii][jj] += C[ii+bs*jj];
		}

	// unblocked LU without pivoting, left-looking, first kn columns only
	int jmax = kn<4 ? kn : 4;
	int imax = km<4 ? km : 4;

	for(jj=0; jj<jmax; jj++)
		{
		// update column jj with the already-factorized columns
		for(ll=0; ll<jj; ll++)
			{
			for(ii=ll+1; ii<4; ii++)
				cc[ii][jj] -= cc[ii][ll] * cc[ll][jj];
			}
		// scale the subdiagonal by the pivot reciprocal (no pivoting:
		// a zero diagonal gives inf/nan, as in the original)
		float d = 1.0 / cc[jj][jj];
		for(ii=jj+1; ii<4; ii++)
			cc[ii][jj] *= d;
		inv_diag_D[jj] = d;
		}

	// store the top-left km x kn corner of the result
	for(jj=0; jj<jmax; jj++)
		{
		for(ii=0; ii<imax; ii++)
			D[ii+bs*jj] = cc[ii][jj];
		}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size (4x4) entry point: forwards to the variable-size kernel with
// km=kn=4 so the full block is factorized and stored.
void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
	{
	kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size strsm kernel, nn / left-lower / unit-diagonal variant:
// accumulates cc = C - A*B (B panel-major with panel stride sdb), then
// forward-solves the ROWS against the unit-lower-triangular factor E (no
// diagonal scaling, hence "one") and stores the km x kn corner into D.
// Panels are column-major with leading dimension bs=4.
// Fix over the original: the original declared a local `tmp` that was never
// used (the solve reads e_1..e_3 only), triggering -Wunused-variable; the
// dead variable is removed.
// Floating-point order matches the unrolled original exactly: each
// accumulator gets one update per k in k order; solve stage l subtracts
// row l (which stage l leaves untouched) from rows l+1..3, stages running
// in ascending l with the same km==l+1 early-exit masking as the original.
void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	float cc[4][4] = {{0.0f}}; // cc[row][col] accumulator

	int ii, jj, ll, k;

	// gemm part: unrolled-by-4 panels, then the remainder one k at a time
	// (loops are naturally empty when kmax<=0, like the original's goto)
	for(k=0; k<kmax-3; k+=4)
		{
		for(ll=0; ll<4; ll++)
			{
			for(jj=0; jj<4; jj++)
				{
				for(ii=0; ii<4; ii++)
					cc[ii][jj] -= A[ii+bs*ll] * B[ll+bs*jj];
				}
			}
		A += 16;
		B += 4*sdb;
		}
	for(; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				cc[ii][jj] -= A[ii+bs*0] * B[0+bs*jj];
			}
		A += 4;
		B += 1;
		}

	// add C
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			cc[ii][jj] += C[ii+bs*jj];
		}

	// forward solve with unit-lower E, row by row; rows beyond km are
	// not needed, so stop as soon as km rows have been produced
	for(ll=0; ll<3; ll++)
		{
		if(km==ll+1)
			break;
		for(ii=ll+1; ii<4; ii++)
			{
			float e = E[ii+bs*ll];
			for(jj=0; jj<4; jj++)
				cc[ii][jj] -= e * cc[ll][jj];
			}
		}

	// store the top-left km x kn corner of the result
	{
	int jmax = kn<4 ? kn : 4;
	int imax = km<4 ? km : 4;
	for(jj=0; jj<jmax; jj++)
		{
		for(ii=0; ii<imax; ii++)
			D[ii+bs*jj] = cc[ii][jj];
		}
	}

	return;

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size entry point: delegates to the variable-size kernel with the
+// full block stored (km==4 rows, kn==4 columns).
+void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
+	{
+	kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size single-precision strsm kernel on a 4x4 block in lib4
+// panel-major storage (bs=4): first accumulates the update c = C - A*B over
+// kmax inner iterations, then solves X * E = c by forward substitution
+// across columns, where the code reads only the strictly-upper entries of E
+// (E[0+bs*1], E[0+bs*2], E[1+bs*2], E[0+bs*3], E[1+bs*3], E[2+bs*3]) and
+// inv_diag_E[j], which is used as a multiplicative factor for column j
+// (i.e. assumed to hold the reciprocal of E's diagonal — TODO confirm with
+// the factorization routine that fills it).  km/kn (<=4) mask how many
+// rows/columns are stored into D; sdb is the panel stride of B.
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	// (tmp is declared but never referenced in this kernel)
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	// no update term: skip straight to adding C and solving
+	if(kmax<=0)
+		goto add;
+
+	// main loop, unrolled by 4: each iteration consumes one 4x4 panel of A
+	// (16 floats) and 4 rows of B, accumulating c_ij -= A*B
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// cleanup loop: one column of A / one row of B per iteration
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the C term: at this point c_ij == -(A*B)_ij, so c becomes C - A*B
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+	
+	// solve
+	// forward substitution over columns: column j is scaled by
+	// inv_diag_E[j] after subtracting the contributions of columns < j
+	// through the upper entries e_ij of E; early-out at kn columns
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+	
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+	
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+	
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	// masked store: write only the km x kn top-left corner of the block
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size entry point: delegates to the variable-size kernel with the
+// full block stored (km==4 rows, kn==4 columns).
+void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Variable-size single-precision strsm kernel on a 4x4 block in lib4
+// panel-major storage (bs=4): first accumulates c = C - A*B over kmax inner
+// iterations, then performs a backward substitution over rows, solving with
+// E read at its upper entries (E[i+bs*j], i<j) starting from the last row;
+// inv_diag_E[i] is used as a multiplicative factor for row i (i.e. assumed
+// to hold the reciprocal of E's diagonal — TODO confirm against the
+// factorization routine).  km guards both the solve steps and the stores;
+// kn masks the stored columns; sdb is the panel stride of B.
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	// (tmp is declared but never referenced in this kernel)
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	// no update term: skip straight to adding C and solving
+	if(kmax<=0)
+		goto add;
+
+	// main loop, unrolled by 4: each iteration consumes one 4x4 panel of A
+	// (16 floats) and 4 rows of B, accumulating c_ij -= A*B
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	// cleanup loop: one column of A / one row of B per iteration
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	// add the C term: at this point c_ij == -(A*B)_ij, so c becomes C - A*B
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+//	printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+//	printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+//	printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+//	printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+	
+	// solve
+	// backward substitution over rows (last row first); each step scales a
+	// row by its inv_diag_E reciprocal, then eliminates it from the rows
+	// above through the upper entries e_ij of E; rows beyond km are skipped
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+	
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+	
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+	
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	// NOTE(review): this label is never the target of a goto in this
+	// variant (the solve phase uses if-guards instead); kept for symmetry
+	// with the sibling kernels, but it may trigger an unused-label warning
+	store:
+
+	// masked store: write only the km x kn top-left corner of the block
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// Fixed-size entry point: delegates to the variable-size kernel with the
+// full block stored (km==4 rows, kn==4 columns).
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
diff --git a/kernel/c99/kernel_sgemm_diag_lib4.c b/kernel/c99/kernel_sgemm_diag_lib4.c
new file mode 100644
index 0000000..93df707
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_diag_lib4.c
@@ -0,0 +1,1112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = alpha * A * diag(B[0..3]): column j of the 4-column result is column
+// j of A scaled by alpha*B[j].  C is not read (beta==0 variant).  kmax is
+// the number of rows; sda/sdd are the panel strides of A and D (the main
+// loop walks 4 rows == one panel per iteration, the cleanup loop walks one
+// row at a time within the final panel).
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+		
+	// pre-scale the diagonal entries once: b_j = alpha * B[j]
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	// main loop: one full 4-row panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_0;
+		c_2 = a_2 * b_0;
+		c_3 = a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = a_0 * b_1;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_1;
+		c_3 = a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = a_0 * b_2;
+		c_1 = a_1 * b_2;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = a_0 * b_3;
+		c_1 = a_1 * b_3;
+		c_2 = a_2 * b_3;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		D += 4*sdd;
+		
+		}
+	// cleanup loop: one row per iteration within the last, partial panel
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B[0..3]) over 4 columns and kmax rows;
+// sda/sdc/sdd are the panel strides of A, C and D.
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	// pre-scale the diagonal entries once: b_j = alpha * B[j]
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	// main loop: one full 4-row panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	// cleanup loop: one row per iteration within the last, partial panel
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B[0..2]): 3-column edge case of the
+// right-diagonal kernel above; kmax rows, sda/sdc/sdd panel strides.
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	// pre-scale the diagonal entries once: b_j = alpha * B[j]
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	
+	// main loop: one full 4-row panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	// cleanup loop: one row per iteration within the last, partial panel
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B[0..1]): 2-column edge case of the
+// right-diagonal kernel; kmax rows, sda/sdc/sdd panel strides.
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	// pre-scale the diagonal entries once: b_j = alpha * B[j]
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+
+	// main loop: one full 4-row panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	// cleanup loop: one row per iteration within the last, partial panel
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * A * diag(B[0..0]): single-column edge case of the
+// right-diagonal kernel; kmax rows, sda/sdc/sdd panel strides.
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	// pre-scale the single diagonal entry once
+	b_0 = alpha0 * B[0];
+	
+	// main loop: one full 4-row panel per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	// cleanup loop: one row per iteration within the last, partial panel
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = alpha * diag(A[0..3]) * B: row i of the 4-row result is row i of B
+// scaled by alpha*A[i]; C is not read (beta==0 variant).  kmax is the
+// number of columns, processed 4 per main-loop iteration (B and D advance
+// 16 floats = 4 columns of one panel).
+// NOTE(review): the trailing `alg` parameter is never used in this body —
+// presumably kept for signature uniformity with sibling kernels; confirm.
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+		
+	// pre-scale the diagonal entries once: a_i = alpha * A[i]
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	// main loop: 4 columns per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		D += 16;
+		
+		}
+	// cleanup loop: one column per iteration
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// D = beta * C + alpha * diag(A[0..3]) * B over kmax columns, 4 per
+// main-loop iteration (B, C and D advance 16 floats = 4 columns).
+// NOTE(review): the trailing `alg` parameter is never used in this body —
+// presumably kept for signature uniformity with sibling kernels; confirm.
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	// pre-scale the diagonal entries once: a_i = alpha * A[i]
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	// main loop: 4 columns per iteration
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	// cleanup loop: one column per iteration
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix: D[0:3,0:kmax] = alpha*diag(A[0:3])*B + beta*C (top 3 rows of a lib4 panel)
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0) // empty column range: nothing to do
+		return;
+	
+	const int bs = 4; // panel height of the lib4 storage format
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0]; // pre-scale the 3 diagonal entries by alpha once
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+
+	for(k=0; k<kmax-3; k+=4) // main loop: 4 columns per iteration
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+
+		B += 16; // advance one full 4-column panel
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++) // tail loop: remaining 1-3 columns
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix: D[0:2,0:kmax] = alpha*diag(A[0:2])*B + beta*C (top 2 rows of a lib4 panel)
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0) // empty column range: nothing to do
+		return;
+	
+	const int bs = 4; // panel height of the lib4 storage format
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1,
+		b_0, b_1,
+		c_0, c_1;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0]; // pre-scale the 2 diagonal entries by alpha once
+	a_1 = alpha0 * A[1];
+
+	for(k=0; k<kmax-3; k+=4) // main loop: 4 columns per iteration
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+
+		B += 16; // advance one full 4-column panel
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++) // tail loop: remaining 1-3 columns
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix: D[0,0:kmax] = alpha*A[0]*B + beta*C (single top row of a lib4 panel)
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0) // empty column range: nothing to do
+		return;
+	
+	const int bs = 4; // panel height of the lib4 storage format
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0]; // pre-scale the single diagonal entry by alpha once
+		
+	for(k=0; k<kmax-3; k+=4) // main loop: 4 columns per iteration
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16; // advance one full 4-column panel
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++) // tail loop: remaining 1-3 columns
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+#endif
+
+
+
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1) // z[i] = alpha*A[i,:]*x + beta*y[i], stored only for rows i in [k0,k1)
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+
+	int k;
+
+	float
+		x_0,
+		y_0=0, y_1=0, y_2=0, y_3=0; // per-row accumulators
+	
+	k=0;
+	for(; k<kmax-3; k+=4) // main loop: 4 columns per iteration
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		x_0 = x[1];
+
+		y_0 += A[0+bs*1] * x_0;
+		y_1 += A[1+bs*1] * x_0;
+		y_2 += A[2+bs*1] * x_0;
+		y_3 += A[3+bs*1] * x_0;
+		
+		x_0 = x[2];
+
+		y_0 += A[0+bs*2] * x_0;
+		y_1 += A[1+bs*2] * x_0;
+		y_2 += A[2+bs*2] * x_0;
+		y_3 += A[3+bs*2] * x_0;
+		
+		x_0 = x[3];
+
+		y_0 += A[0+bs*3] * x_0;
+		y_1 += A[1+bs*3] * x_0;
+		y_2 += A[2+bs*3] * x_0;
+		y_3 += A[3+bs*3] * x_0;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++) // tail loop: remaining 1-3 columns
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0]; // apply scalar alpha/beta scaling
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(k0<=0 & k1>3) // bitwise & on 0/1 comparison results: equivalent to && here (branchless style)
+		{
+		z[0] = y_0; // fast path: all 4 rows are inside [k0,k1)
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		if(k0<=0 & k1>0) z[0] = y_0; // masked stores for partial row ranges
+		if(k0<=1 & k1>1) z[1] = y_1;
+		if(k0<=2 & k1>2) z[2] = y_2;
+		if(k0<=3 & k1>3) z[3] = y_3;
+		}
+
+	}
+#endif
+	
+	
+	
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z) // full-height variant: delegates to _gen with all 4 rows stored
+	{
+
+	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1) // variable-size variant: stores rows [0,k1) via _gen
+	{
+
+	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km) // z[0:4] = alpha*A'*x + beta*y[0:4]; offA = rows into first panel; km masks stores
+	{
+
+	const int bs  = 4; // panel height of the lib4 storage format
+	
+	int k, kend;
+	
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0; // per-column accumulators (columns of A = rows of A')
+	
+	k=0;
+	if(offA!=0) // 1, 2, 3 -- align to the next panel boundary first
+		{
+		kend = 4-offA<kmax ? 4-offA : kmax;
+		for(; k<kend; k++)
+			{
+			
+			x_0 = x[0];
+		
+			y_0 += A[0+bs*0] * x_0;
+			y_1 += A[0+bs*1] * x_0;
+			y_2 += A[0+bs*2] * x_0;
+			y_3 += A[0+bs*3] * x_0;
+		
+			A += 1;
+			x += 1;
+			
+			}
+		A += bs*(sda-1); // jump to the start of the next panel
+		}
+	for(; k<kmax-bs+1; k+=bs) // main loop: one full panel (4 rows of x) per iteration
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs; // next panel
+		x += 4;
+
+		}
+	for(; k<kmax; k++) // tail loop: remaining 1-3 rows
+		{
+		
+		x_0 = x[0];
+	
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+	
+		A += 1;
+		x += 1;
+		
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0]; // apply scalar alpha/beta scaling
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0; // km<4: z[3] is intentionally never written in this branch
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z) // aligned full-height variant: offA=0, all 4 results stored
+	{
+
+	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1) // variable-size variant: stores the first k1 results
+	{
+
+	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn) // forward substitution: subtract A[:,0:kmax]*x from y, then solve the 4x4 lower triangle using precomputed 1/diag; km/kn mask rows/cols
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int k;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0; // negated accumulators of A*x
+	
+	k=0;
+	for(; k<kmax-3; k+=4) // NOTE(review): no cleanup loop -- kmax appears assumed to be a multiple of 4 here; confirm with callers
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[1+bs*0] * x_0;
+		y_2 -= A[2+bs*0] * x_0;
+		y_3 -= A[3+bs*0] * x_0;
+
+		y_0 -= A[0+bs*1] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[2+bs*1] * x_1;
+		y_3 -= A[3+bs*1] * x_1;
+
+		y_0 -= A[0+bs*2] * x_2;
+		y_1 -= A[1+bs*2] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[3+bs*2] * x_2;
+
+		y_0 -= A[0+bs*3] * x_3;
+		y_1 -= A[1+bs*3] * x_3;
+		y_2 -= A[2+bs*3] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	y_0 = y[0] + y_0; // rhs minus accumulated A*x
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	float
+		a_00, a_10, a_20, a_30,
+		a_11, a_21, a_31;
+	
+	// a_00 -- solve row 0, then eliminate column 0 from rows 1..3
+	a_00 = inv_diag_A[0];
+	a_10 = A[1+bs*0];
+	a_20 = A[2+bs*0];
+	a_30 = A[3+bs*0];
+	y_0 *= a_00; // multiply by 1/diag instead of dividing
+	z[0] = y_0;
+	y_1 -= a_10 * y_0;
+	y_2 -= a_20 * y_0;
+	y_3 -= a_30 * y_0;
+
+	if(kn==1) // only 1 column available: store the remaining partial results
+		{
+		if(km==1)
+			return;
+		y[1] = y_1; // NOTE(review): stores into y, not z, for rows beyond kn -- every other store in this kernel targets z; confirm intended
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_11 -- solve row 1, eliminate column 1 from rows 2..3
+	a_11 = inv_diag_A[1];
+	a_21 = A[2+bs*1];
+	a_31 = A[3+bs*1];
+	y_1 *= a_11;	
+	z[1] = y_1;
+	y_2 -= a_21 * y_1;
+	y_3 -= a_31 * y_1;
+
+	if(kn==2)
+		{
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_22 -- solve row 2, eliminate column 2 from row 3
+	a_00 = inv_diag_A[2];
+	a_10 = A[3+bs*2];
+	y_2 *= a_00;
+	z[2] = y_2;
+	y_3 -= a_10 * y_2;
+
+	if(kn==3)
+		{
+		if(km==3)
+			return;
+		y[3] = y_3;
+
+		return;
+		}
+
+	// a_33 -- solve the last row
+	a_11 = inv_diag_A[3];
+	y_3 *= a_11;	
+	z[3] = y_3;
+
+	}
+#endif
+	
+
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z) // full 4x4 variant: delegates to _vs with km=kn=4
+	{
+
+	kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+	}
+#endif
+	
+	
+		
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z) // backward substitution with L': accumulate -L[4:kmax,0:4]'*x, then solve the transposed 4x4 triangle
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int
+		k;
+	
+	float *tA, *tx; // saved pointers, restored for the triangular solve below
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=4;
+	A += 4 + (sda-1)*bs; // skip the 4x4 diagonal block; it is handled after the accumulation
+	x += 4;
+	for(; k<kmax-3; k+=4) // main loop: one full panel per iteration
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[1+bs*2] * x_1;
+		y_3 -= A[1+bs*3] * x_1;
+		
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[2+bs*3] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++) // tail loop: remaining 1-3 rows
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+	
+	y_0 = y[0] + y_0; // rhs minus accumulated product
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_3 *= inv_diag_A[3]; // multiply by 1/diag instead of dividing
+	z[3] = y_3;
+
+	y_2 -= A[3+bs*2] * y_3;
+	y_2 *= inv_diag_A[2];
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+	y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z) // backward substitution with L' for 3 unknowns: accumulate -L[3:kmax,0:3]'*x, then solve the transposed 3x3 triangle
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int
+		k;
+	
+	float *tA, *tx; // saved pointers, restored for the triangular solve below
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0;
+	
+	k = 3;
+	if(kmax>4)
+		{
+		// clean up at the beginning -- row 3 of the first panel
+		x_3 = x[3];
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs; // jump to the next panel
+		x += 4;
+		for(; k<kmax-3; k+=4) // main loop: one full panel per iteration
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+			y_2 -= A[0+bs*2] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			y_2 -= A[1+bs*2] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+			y_2 -= A[2+bs*2] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			y_2 -= A[3+bs*2] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 3;
+		x += 3; // FIX(review): was 'x += 1' -- A and x must advance to the same row index (cf. _2: A+=2,x+=2 and _1: A+=1,x+=1); with x+=1 the kmax==4 tail read x[1] instead of x[3]
+		}
+	for(; k<kmax; k++) // tail loop: remaining rows one at a time
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0; // rhs minus accumulated product
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_2 *= inv_diag_A[2]; // multiply by 1/diag instead of dividing
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2;
+	y_1 -= A[2+bs*1]*y_2;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z) // backward substitution with L' for 2 unknowns: accumulate -L[2:kmax,0:2]'*x, then solve the transposed 2x2 triangle
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int
+		k;
+	
+	float *tA, *tx; // saved pointers, restored for the triangular solve below
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0;
+	
+	k = 2;
+	if(kmax>4)
+		{
+		// clean up at the beginning -- rows 2 and 3 of the first panel
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs; // jump to the next panel
+		x += 4;
+		for(; k<kmax-3; k+=4) // main loop: one full panel per iteration
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 2; // advance A and x together to row index 2
+		x += 2;
+		}
+	for(; k<kmax; k++) // tail loop: remaining rows one at a time
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0; // rhs minus accumulated product
+	y_1 = y[1] + y_1;
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_1 *= inv_diag_A[1]; // multiply by 1/diag instead of dividing
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z) // backward substitution with L' for 1 unknown: accumulate -L[1:kmax,0]'*x, then scale by 1/diag
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int
+		k;
+	
+	float *tA, *tx; // saved pointers, restored after the accumulation
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0;
+	
+	k = 1;
+	if(kmax>4)
+		{
+		// clean up at the beginning -- rows 1..3 of the first panel
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_0 -= A[2+bs*0] * x_2;
+		y_0 -= A[3+bs*0] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs; // jump to the next panel
+		x += 4;
+		for(; k<kmax-3; k+=4) // main loop: one full panel per iteration
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_0 -= A[1+bs*0] * x_1;
+			y_0 -= A[2+bs*0] * x_2;
+			y_0 -= A[3+bs*0] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 1; // advance A and x together to row index 1
+		x += 1;
+		}
+	for(; k<kmax; k++) // tail loop: remaining rows one at a time
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0; // rhs minus accumulated product
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_0 *= inv_diag_A[0]; // multiply by 1/diag instead of dividing
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z) // z[0:4] = U[0:4,0:kmax]*x with U upper triangular, not transposed
+	{
+
+	const int bs = 4; // panel height of the lib4 storage format
+	
+	int k;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	x_0 = x[0]; // first 4x4 block: only the upper triangle contributes (lower terms left commented out)
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+
+	y_0 += A[0+bs*0] * x_0;
+/*	y_1 += A[1+bs*0] * x_0;*/
+/*	y_2 += A[2+bs*0] * x_0;*/
+/*	y_3 += A[3+bs*0] * x_0;*/
+
+	y_0 += A[0+bs*1] * x_1;
+	y_1 += A[1+bs*1] * x_1;
+/*	y_2 += A[2+bs*1] * x_1;*/
+/*	y_3 += A[3+bs*1] * x_1;*/
+
+	y_0 += A[0+bs*2] * x_2;
+	y_1 += A[1+bs*2] * x_2;
+	y_2 += A[2+bs*2] * x_2;
+/*	y_3 += A[3+bs*2] * x_2;*/
+
+	y_0 += A[0+bs*3] * x_3;
+	y_1 += A[1+bs*3] * x_3;
+	y_2 += A[2+bs*3] * x_3;
+	y_3 += A[3+bs*3] * x_3;
+	
+	A += 4*bs;
+	x += 4;
+
+	k=4;
+	for(; k<kmax-3; k+=4) // remaining columns: dense 4x4 blocks
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+
+		y_0 += A[0+bs*1] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[2+bs*1] * x_1;
+		y_3 += A[3+bs*1] * x_1;
+
+		y_0 += A[0+bs*2] * x_2;
+		y_1 += A[1+bs*2] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[3+bs*2] * x_2;
+
+		y_0 += A[0+bs*3] * x_3;
+		y_1 += A[1+bs*3] * x_3;
+		y_2 += A[2+bs*3] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++) // tail loop: remaining 1-3 columns
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	z[0] = y_0;
+	z[1] = y_1;
+	z[2] = y_2;
+	z[3] = y_3;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km) // z[0:km] = (U')*x with U upper triangular; km masks the stores
+	{
+
+	const int bs  = 4; // panel height of the lib4 storage format
+	
+	int
+		k;
+	
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-4; k+=4) // note kmax-4 (not -3): the last panel holds the triangle and is handled below; assumes kmax>=4 -- TODO confirm with callers
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs; // next panel
+		x += 4;
+
+		}
+
+	x_0 = x[0]; // last 4x4 block: only the triangle of U' contributes (strictly-lower terms left commented out)
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+	
+	y_0 += A[0+bs*0] * x_0;
+	y_1 += A[0+bs*1] * x_0;
+	y_2 += A[0+bs*2] * x_0;
+	y_3 += A[0+bs*3] * x_0;
+
+/*	y_0 += A[1+bs*0] * x_1;*/
+	y_1 += A[1+bs*1] * x_1;
+	y_2 += A[1+bs*2] * x_1;
+	y_3 += A[1+bs*3] * x_1;
+	
+/*	y_0 += A[2+bs*0] * x_2;*/
+/*	y_1 += A[2+bs*1] * x_2;*/
+	y_2 += A[2+bs*2] * x_2;
+	y_3 += A[2+bs*3] * x_2;
+
+/*	y_0 += A[3+bs*0] * x_3;*/
+/*	y_1 += A[3+bs*1] * x_3;*/
+/*	y_2 += A[3+bs*2] * x_3;*/
+	y_3 += A[3+bs*3] * x_3;
+	
+//	A += sda*bs;
+//	x += 4;
+
+	// store_vs
+	store: // NOTE(review): label is unreferenced (no goto in this function) -- may trigger -Wunused-label; confirm whether a goto was dropped
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0; // km<4: z[3] is intentionally never written in this branch
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z) // full variant: delegates to _vs with all 4 results stored
+	{
+	
+	kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+	}
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgetrf_pivot_4_lib4.c b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..fdec8de
--- /dev/null
+++ b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
@@ -0,0 +1,786 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_s_aux.h"
+
+
+
+// C numbering, starting from 0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax) // isamax over a panel-major column: 0-based index and value of max |pA[i]| over n rows
+	{
+
+	int idamax, ii;
+	float tmp, amax;
+		
+	p_idamax[0] = -1; // sentinel result for the empty case
+	if(n<1)
+		return;
+
+	const int bs = 4;
+
+	int na = (bs - offset%bs)%bs; // rows left in the current 4-row panel before the panel boundary
+	na = n<na ? n : na;
+
+	amax = -1.0; // any fabs(.)>=0 beats this, so idamax is always assigned when n>=1
+	ii = 0;
+	if(na>0)
+		{
+		for( ; ii<na; ii++) // head: finish the partial panel one row at a time
+			{
+			tmp = fabs(pA[0]); // NOTE(review): fabs promotes float to double; fabsf would avoid the round trip — result value unchanged
+			if(tmp>amax)
+				{
+				idamax = ii+0;
+				amax = tmp;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1); // rows are contiguous inside a panel; jump to the start of the next panel
+		}
+	for( ; ii<n-3; ii+=4) // main loop: one full 4-row panel per iteration
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		tmp = fabs(pA[1]);
+		if(tmp>amax)
+			{
+			idamax = ii+1;
+			amax = tmp;
+			}
+		tmp = fabs(pA[2]);
+		if(tmp>amax)
+			{
+			idamax = ii+2;
+			amax = tmp;
+			}
+		tmp = fabs(pA[3]);
+		if(tmp>amax)
+			{
+			idamax = ii+3;
+			amax = tmp;
+			}
+		pA += bs*sda; // next 4-row panel
+		}
+	for( ; ii<n; ii++) // tail: remaining rows of the last, partial panel
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		pA += 1;
+		}
+	
+	p_amax[0] = amax; // max magnitude found
+	p_idamax[0] = idamax; // its row index, counted from the starting row
+
+	return;
+
+	}
+#endif
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv) // LU with partial pivoting of an m x 4 panel-major block: unit-L/U stored in place, 0-based pivot rows in ipiv, reciprocals of the pivots in inv_diag_A
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4; // rows below the 4x4 top block
+
+	float
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	float
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column: pivot search, row swap, then scale the sub-diagonal by 1/pivot
+	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0); // tmp0 = |pivot| candidate over column 0
+	ipiv[0] = idamax;
+	if(tmp0!=0.0) // nonzero pivot: swap, store reciprocal, scale
+		{
+		if(ipiv[0]!=0)
+			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // idx/bs*bs*sda + idx%bs maps a linear row index to its panel-major address
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		pA[1+bs*0] *= tmp0;
+		pA[2+bs*0] *= tmp0;
+		pA[3+bs*0] *= tmp0;
+		pB = pA + bs*sda; // panels below the top 4x4 block
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB[1+bs*0] *= tmp0;
+			pB[2+bs*0] *= tmp0;
+			pB[3+bs*0] *= tmp0;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++) // tail rows of the last partial panel
+			{
+			pB[0+bs*0] *= tmp0;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0; // singular column: flag with a zero inverse
+		}
+
+	// second column: correct with column 0, then pivot & scale
+	u_01  = pA[0+bs*1];
+	tmp1  = pA[1+bs*1];
+	tmp2  = pA[2+bs*1];
+	tmp3  = pA[3+bs*1];
+	tmp1 -= pA[1+bs*0] * u_01;
+	tmp2 -= pA[2+bs*0] * u_01;
+	tmp3 -= pA[3+bs*0] * u_01;
+	pA[1+bs*1] = tmp1;
+	pA[2+bs*1] = tmp2;
+	pA[3+bs*1] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*1];
+		tmp1  = pB[1+bs*1];
+		tmp2  = pB[2+bs*1];
+		tmp3  = pB[3+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		tmp1 -= pB[1+bs*0] * u_01;
+		tmp2 -= pB[2+bs*0] * u_01;
+		tmp3 -= pB[3+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB[1+bs*1] = tmp1;
+		pB[2+bs*1] = tmp2;
+		pB[3+bs*1] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1); // pivot search below the diagonal of column 1
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		pA[2+bs*1] *= tmp1;
+		pA[3+bs*1] *= tmp1;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB[1+bs*1] *= tmp1;
+			pB[2+bs*1] *= tmp1;
+			pB[3+bs*1] *= tmp1;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	// third column: correct with columns 0-1, then pivot & scale
+	u_02  = pA[0+bs*2];
+	u_12  = pA[1+bs*2];
+	u_12 -= pA[1+bs*0] * u_02; // U entry must itself be corrected before it is used below
+	pA[1+bs*2] = u_12;
+	tmp2  = pA[2+bs*2];
+	tmp3  = pA[3+bs*2];
+	tmp2 -= pA[2+bs*0] * u_02;
+	tmp3 -= pA[3+bs*0] * u_02;
+	tmp2 -= pA[2+bs*1] * u_12;
+	tmp3 -= pA[3+bs*1] * u_12;
+	pA[2+bs*2] = tmp2;
+	pA[3+bs*2] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp1  = pB[1+bs*2];
+		tmp2  = pB[2+bs*2];
+		tmp3  = pB[3+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp1 -= pB[1+bs*0] * u_02;
+		tmp2 -= pB[2+bs*0] * u_02;
+		tmp3 -= pB[3+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		tmp1 -= pB[1+bs*1] * u_12;
+		tmp2 -= pB[2+bs*1] * u_12;
+		tmp3 -= pB[3+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB[1+bs*2] = tmp1;
+		pB[2+bs*2] = tmp2;
+		pB[3+bs*2] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2); // pivot search below the diagonal of column 2
+	ipiv[2] = idamax+2;
+	if(tmp2!=0)
+		{
+		if(ipiv[2]!=2)
+			srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		tmp2 = 1.0 / pA[2+bs*2];
+		inv_diag_A[2] = tmp2;
+		pA[3+bs*2] *= tmp2;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB[1+bs*2] *= tmp2;
+			pB[2+bs*2] *= tmp2;
+			pB[3+bs*2] *= tmp2;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[2] = 0.0;
+		}
+
+	// fourth column: correct with columns 0-2, then pivot & scale
+	u_03  = pA[0+bs*3];
+	u_13  = pA[1+bs*3];
+	u_13 -= pA[1+bs*0] * u_03;
+	pA[1+bs*3] = u_13;
+	u_23  = pA[2+bs*3];
+	u_23 -= pA[2+bs*0] * u_03;
+	u_23 -= pA[2+bs*1] * u_13;
+	pA[2+bs*3] = u_23;
+	tmp3  = pA[3+bs*3];
+	tmp3 -= pA[3+bs*0] * u_03;
+	tmp3 -= pA[3+bs*1] * u_13;
+	tmp3 -= pA[3+bs*2] * u_23;
+	pA[3+bs*3] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp1  = pB[1+bs*3];
+		tmp2  = pB[2+bs*3];
+		tmp3  = pB[3+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp1 -= pB[1+bs*0] * u_03;
+		tmp2 -= pB[2+bs*0] * u_03;
+		tmp3 -= pB[3+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp1 -= pB[1+bs*1] * u_13;
+		tmp2 -= pB[2+bs*1] * u_13;
+		tmp3 -= pB[3+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		tmp1 -= pB[1+bs*2] * u_23;
+		tmp2 -= pB[2+bs*2] * u_23;
+		tmp3 -= pB[3+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB[1+bs*3] = tmp1;
+		pB[2+bs*3] = tmp2;
+		pB[3+bs*3] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3); // pivot search below the diagonal of column 3
+	ipiv[3] = idamax+3;
+	if(tmp3!=0)
+		{
+		if(ipiv[3]!=3)
+			srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		tmp3 = 1.0 / pA[3+bs*3];
+		inv_diag_A[3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB[1+bs*3] *= tmp3;
+			pB[2+bs*3] *= tmp3;
+			pB[3+bs*3] *= tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[3] = 0.0;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv) // variable-size variant: m>0 rows, 0<n<=4 columns; same in-place LU contract as kernel_sgetrf_pivot_4_lib4
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// m-4 may be negative here; ma is only used on the m>=4 paths below
+	int ma = m-4;
+
+	float
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	float
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+
+	// find pivot & scale
+	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs); // idx/bs*bs*sda + idx%bs maps a linear row index to its panel-major address
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		if(m>=4) // full 4-row head plus the panels below
+			{
+			pA[1+bs*0] *= tmp0;
+			pA[2+bs*0] *= tmp0;
+			pA[3+bs*0] *= tmp0;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB[1+bs*0] *= tmp0;
+				pB[2+bs*0] *= tmp0;
+				pB[3+bs*0] *= tmp0;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB += 1;
+				}
+			}
+		else // m = {1,2,3}
+			{
+			if(m>1)
+				{
+				pA[1+bs*0] *= tmp0;
+				if(m>2)
+					pA[2+bs*0] *= tmp0;
+				}
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0; // singular column: flag with a zero inverse
+		}
+	
+	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+		return;
+
+	// second column
+
+	// correct with column 0
+	if(m>=4)
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp2  = pA[2+bs*1];
+		tmp3  = pA[3+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		tmp2 -= pA[2+bs*0] * u_01;
+		tmp3 -= pA[3+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		pA[2+bs*1] = tmp2;
+		pA[3+bs*1] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*1];
+			tmp1  = pB[1+bs*1];
+			tmp2  = pB[2+bs*1];
+			tmp3  = pB[3+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			tmp1 -= pB[1+bs*0] * u_01;
+			tmp2 -= pB[2+bs*0] * u_01;
+			tmp3 -= pB[3+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB[1+bs*1] = tmp1;
+			pB[2+bs*1] = tmp2;
+			pB[3+bs*1] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*1];
+			tmp2 -= pA[2+bs*0] * u_01;
+			pA[2+bs*1] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		if(m>=4)
+			{
+			pA[2+bs*1] *= tmp1;
+			pA[3+bs*1] *= tmp1;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB[1+bs*1] *= tmp1;
+				pB[2+bs*1] *= tmp1;
+				pB[3+bs*1] *= tmp1;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB += 1;
+				}
+			}
+		else // m = {2,3}
+			{
+			if(m>2)
+				pA[2+bs*1] *= tmp1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	if(n==2) // columns 2 and 3 not requested
+		return;
+
+	// third column
+
+	// correct with columns 0-1
+	if(m>=4)
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02; // U entry must itself be corrected before it is used below
+		pA[1+bs*2] = u_12;
+		tmp2  = pA[2+bs*2];
+		tmp3  = pA[3+bs*2];
+		tmp2 -= pA[2+bs*0] * u_02;
+		tmp3 -= pA[3+bs*0] * u_02;
+		tmp2 -= pA[2+bs*1] * u_12;
+		tmp3 -= pA[3+bs*1] * u_12;
+		pA[2+bs*2] = tmp2;
+		pA[3+bs*2] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp1  = pB[1+bs*2];
+			tmp2  = pB[2+bs*2];
+			tmp3  = pB[3+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp1 -= pB[1+bs*0] * u_02;
+			tmp2 -= pB[2+bs*0] * u_02;
+			tmp3 -= pB[3+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			tmp1 -= pB[1+bs*1] * u_12;
+			tmp2 -= pB[2+bs*1] * u_12;
+			tmp3 -= pB[3+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB[1+bs*2] = tmp1;
+			pB[2+bs*2] = tmp2;
+			pB[3+bs*2] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*2];
+			tmp2 -= pA[2+bs*0] * u_02;
+			tmp2 -= pA[2+bs*1] * u_12;
+			pA[2+bs*2] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	if(m>2) // NOTE(review): when n>2 but m<=2, ipiv[2]/inv_diag_A[2] are left unset — presumably callers read only min(m,n) entries; confirm
+		{
+		sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+		ipiv[2] = idamax+2;
+		if(tmp2!=0)
+			{
+			if(ipiv[2]!=2)
+				srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			tmp2 = 1.0 / pA[2+bs*2];
+			inv_diag_A[2] = tmp2;
+			if(m>=4)
+				{
+				pA[3+bs*2] *= tmp2;
+				pB = pA + bs*sda;
+				for(k=0; k<ma-3; k+=4)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB[1+bs*2] *= tmp2;
+					pB[2+bs*2] *= tmp2;
+					pB[3+bs*2] *= tmp2;
+					pB += bs*sda;
+					}
+				for( ; k<ma; k++)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB += 1;
+					}
+				}
+			}
+		else
+			{
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n<4)
+		return;
+
+	// fourth column
+
+	// correct with columns 0-2
+	if(m>=4)
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		u_23  = pA[2+bs*3];
+		u_23 -= pA[2+bs*0] * u_03;
+		u_23 -= pA[2+bs*1] * u_13;
+		pA[2+bs*3] = u_23;
+		tmp3  = pA[3+bs*3];
+		tmp3 -= pA[3+bs*0] * u_03;
+		tmp3 -= pA[3+bs*1] * u_13;
+		tmp3 -= pA[3+bs*2] * u_23;
+		pA[3+bs*3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp1  = pB[1+bs*3];
+			tmp2  = pB[2+bs*3];
+			tmp3  = pB[3+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp1 -= pB[1+bs*0] * u_03;
+			tmp2 -= pB[2+bs*0] * u_03;
+			tmp3 -= pB[3+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp1 -= pB[1+bs*1] * u_13;
+			tmp2 -= pB[2+bs*1] * u_13;
+			tmp3 -= pB[3+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			tmp1 -= pB[1+bs*2] * u_23;
+			tmp2 -= pB[2+bs*2] * u_23;
+			tmp3 -= pB[3+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB[1+bs*3] = tmp1;
+			pB[2+bs*3] = tmp2;
+			pB[3+bs*3] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		if(m>2)
+			{
+			u_23  = pA[2+bs*3];
+			u_23 -= pA[2+bs*0] * u_03;
+			u_23 -= pA[2+bs*1] * u_13;
+			pA[2+bs*3] = u_23;
+			}
+		}
+
+	if(m>3) // NOTE(review): when n==4 but m<=3, ipiv[3]/inv_diag_A[3] are left unset — same min(m,n) assumption as above; confirm
+		{
+		// find pivot & scale
+		sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+		ipiv[3] = idamax+3;
+		if(tmp3!=0)
+			{
+			if(ipiv[3]!=3)
+				srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			tmp3 = 1.0 / pA[3+bs*3];
+			inv_diag_A[3] = tmp3;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB[1+bs*3] *= tmp3;
+				pB[2+bs*3] *= tmp3;
+				pB[3+bs*3] *= tmp3;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB += 1;
+				}
+			}
+		else
+			{
+			inv_diag_A[3] = 0.0;
+			}
+		}
+	
+	return;
+
+	}
+#endif
+
+
+	
+
+
+
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km) // fused gemv N+T over one 4-column panel: z_n[i] += A(i,:)*(alpha_n*x_n) and z_t = alpha_t*A^T*x_t + beta_t*y_t; km (<4) clips the active columns for both the x_n loads and the z_t stores
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0; // columns beyond km contribute zero to the N product
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha_n[0]*x_n[0]; // pre-scale the (up to km) active x_n entries by alpha_n
+	if(km>1)
+		{
+		x_n_1 = alpha_n[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha_n[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha_n[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0; // accumulators for the 4 entries of A^T*x_t
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	for(; k<kmax-3; k+=bs) // main loop: 4 rows (one panel) per iteration, each row feeds both products
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0; // row dot for the N product ...
+		y_t_0 += a_00 * x_t_0; // ... and the same element reused for the T product
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs; // next row panel
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++) // tail: remaining rows one at a time (consecutive inside the last panel)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	// store t: z_t = alpha_t*accumulator + beta_t*y_t, clipped to the first km entries
+	z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+	if(km>1)
+		{
+		z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+		if(km>2)
+			{
+			z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+			if(km>3)
+				{
+				z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t) // fused gemv N+T, fixed-width entry point
+	{
+
+	kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4); // delegate to the variable-size kernel with km=4 (all four columns active)
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	float *x_t = x_n;
+	float *z_t = z_n;
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	if(offA==0)
+		{
+		if(kmax<4)
+			{
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+			if(kmax==1)
+				goto store_t;
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+			if(kmax==2)
+				goto store_t;
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+			goto store_t;
+			}
+		else
+			{
+
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+
+			// 3
+
+			y_n_0 = z_n[3]; 
+			x_t_0 = x_t[3];
+
+			a_00 = A[3+bs*0];
+			a_01 = A[3+bs*1];
+			a_02 = A[3+bs*2];
+			a_03 = A[3+bs*3];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_n_0 += a_02 * x_n_2;
+			y_t_2 += a_02 * x_t_0;
+			y_t_3 += a_03 * x_t_0;
+
+			z_n[3] = y_n_0;
+
+			k += 4;
+			A += sda*bs;
+			z_n += 4;
+			x_t += 4;
+
+			}
+		}
+	else if(offA==1)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==6)
+			goto store_t;
+
+		// 6
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==7)
+			goto store_t;
+
+		k += 7;
+
+		}
+	else if(offA==2)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==6)
+			goto store_t;
+
+		k += 6;
+
+		}
+	else // if(offA==3)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==5)
+			goto store_t;
+
+		k += 5;
+
+		}
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	store_t:
+	z_t[0] += alpha[0]*y_t_0;
+	if(km>1)
+		{
+		z_t[1] += alpha[0]*y_t_1;
+		if(km>2)
+			{
+			z_t[2] += alpha[0]*y_t_2;
+			if(km>3)
+				{
+				z_t[3] += alpha[0]*y_t_3;
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+// Single-precision lower-triangular symv kernel, 4-wide, lib4 (panel-major)
+// layout: thin convenience wrapper around the generalized kernel.
+//
+//   kmax  - number of rows/columns to process
+//   alpha - pointer to the scalar applied to the transposed-part result
+//           (the epilogue scales y_t by alpha[0] before accumulating into z)
+//   A     - panel-major matrix block; sda is its panel stride
+//   x_n   - input vector; z_n - output vector (updated in place)
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
+	{
+
+	// Delegate with offset 0 (A starts panel-aligned) and km = 4
+	// (store all four transposed-part results in the epilogue).
+	// NOTE(review): the gen kernel takes separate n/t vector arguments;
+	// here x_n and z_n serve both roles — confirm against the full
+	// signature of kernel_ssymv_l_4_gen_lib4 (defined earlier in file).
+	kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+
+