Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
+void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	k = 0;
+	if(offsetB!=0)
+		{
+		if(offsetB==1)
+			{
+
+			B += 1;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else if(offsetB==2)
+			{
+
+			B += 2;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else // if(offsetB==3)
+			{
+
+			B += 3;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		}
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}	
+	
+	scale:
+
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	k = 0;
+
+	// k = 0
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 1
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 2
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double *D1;
+	
+	int k;
+
+	B += offsetB;
+
+	k = 0;
+
+	if(offsetB==0)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==1)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==2)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 5
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else // if(offetB==3)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	store:
+	
+	c_00 = alpha[0]*c_00;
+	c_10 = alpha[0]*c_10;
+	c_20 = alpha[0]*c_20;
+	c_30 = alpha[0]*c_30;
+
+	c_01 = alpha[0]*c_01;
+	c_11 = alpha[0]*c_11;
+	c_21 = alpha[0]*c_21;
+	c_31 = alpha[0]*c_31;
+
+	c_02 = alpha[0]*c_02;
+	c_12 = alpha[0]*c_12;
+	c_22 = alpha[0]*c_22;
+	c_32 = alpha[0]*c_32;
+
+	c_03 = alpha[0]*c_03;
+	c_13 = alpha[0]*c_13;
+	c_23 = alpha[0]*c_23;
+	c_33 = alpha[0]*c_33;
+
+	// shift sol for cols
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)  || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
+	{
+	kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+//	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+//	c_02 = C[0+bs*2] + c_02;
+//	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+//	c_03 = C[0+bs*3] + c_03;
+//	c_13 = C[1+bs*3] + c_13;
+//	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(c_00>0)
+		{
+		c_00 = sqrt(c_00);
+		tmp = 1.0/c_00;
+		}
+	else
+		{
+		c_00 = 0.0;
+		tmp = 0.0;
+		}
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+	
+	c_11 -= c_10 * c_10;
+	c_21 -= c_20 * c_10;
+	c_31 -= c_30 * c_10;
+	if(c_11>0)
+		{
+		c_11 = sqrt(c_11);
+		tmp = 1.0/c_11;
+		}
+	else
+		{
+		c_11 = 0.0;
+		tmp = 0.0;
+		}
+	c_21 *= tmp;
+	c_31 *= tmp;
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+	
+	c_22 -= c_20 * c_20;
+	c_32 -= c_30 * c_20;
+	c_22 -= c_21 * c_21;
+	c_32 -= c_31 * c_21;
+	if(c_22>0)
+		{
+		c_22 = sqrt(c_22);
+		tmp = 1.0/c_22;
+		}
+	else
+		{
+		c_22 = 0.0;
+		tmp = 0.0;
+		}
+	c_32 *= tmp;
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+	
+	c_33 -= c_30 * c_30;
+	c_33 -= c_31 * c_31;
+	c_33 -= c_32 * c_32;
+	if(c_33>0)
+		{
+		c_33 = sqrt(c_33);
+		tmp = 1.0/c_33;
+		}
+	else
+		{
+		c_33 = 0.0;
+		tmp = 0.0;
+		}
+	inv_diag_D[3] = tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+	double alpha = 1.0;
+	double beta = 1.0;
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
+	{
+	double alpha = 1.0;
+	double beta = 1.0;
+	kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+	tmp = inv_diag_E[1];
+	c_01 *= tmp;
+	c_11 *= tmp;
+	c_21 *= tmp;
+	c_31 *= tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+	tmp = inv_diag_E[2];
+	c_02 *= tmp;
+	c_12 *= tmp;
+	c_22 *= tmp;
+	c_32 *= tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+	tmp = inv_diag_E[3];
+	c_03 *= tmp;
+	c_13 *= tmp;
+	c_23 *= tmp;
+	c_33 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+	double alpha = 1.0;
+	double beta  = 1.0;
+	kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	double alpha = 1.0;
+	double beta  = 1.0;
+	kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+
+	if(kn>3)
+		{
+		tmp = inv_diag_E[3];
+		c_03 *= tmp;
+		c_13 *= tmp;
+		c_23 *= tmp;
+		c_33 *= tmp;
+		tmp = E[2+bs*3];
+		c_02 -= c_03 * tmp;
+		c_12 -= c_13 * tmp;
+		c_22 -= c_23 * tmp;
+		c_32 -= c_33 * tmp;
+		tmp = E[1+bs*3];
+		c_01 -= c_03 * tmp;
+		c_11 -= c_13 * tmp;
+		c_21 -= c_23 * tmp;
+		c_31 -= c_33 * tmp;
+		tmp = E[0+bs*3];
+		c_00 -= c_03 * tmp;
+		c_10 -= c_13 * tmp;
+		c_20 -= c_23 * tmp;
+		c_30 -= c_33 * tmp;
+		}
+
+	if(kn>2)
+		{
+		tmp = inv_diag_E[2];
+		c_02 *= tmp;
+		c_12 *= tmp;
+		c_22 *= tmp;
+		c_32 *= tmp;
+		tmp = E[1+bs*2];
+		c_01 -= c_02 * tmp;
+		c_11 -= c_12 * tmp;
+		c_21 -= c_22 * tmp;
+		c_31 -= c_32 * tmp;
+		tmp = E[0+bs*2];
+		c_00 -= c_02 * tmp;
+		c_10 -= c_12 * tmp;
+		c_20 -= c_22 * tmp;
+		c_30 -= c_32 * tmp;
+		}
+
+	if(kn>1)
+		{
+		tmp = inv_diag_E[1];
+		c_01 *= tmp;
+		c_11 *= tmp;
+		c_21 *= tmp;
+		c_31 *= tmp;
+		tmp = E[0+bs*1];
+		c_00 -= c_01 * tmp;
+		c_10 -= c_11 * tmp;
+		c_20 -= c_21 * tmp;
+		c_30 -= c_31 * tmp;
+		}
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// factorization
+
+	// first column
+	tmp = 1.0 / c_00;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+
+	// second column
+	c_11 -= c_10 * c_01;
+	c_21 -= c_20 * c_01;
+	c_31 -= c_30 * c_01;
+
+	tmp = 1.0 / c_11;
+	c_21 *= tmp;
+	c_31 *= tmp;
+	
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+
+	// third column
+	c_12 -= c_10 * c_02;
+	c_22 -= c_20 * c_02;
+	c_32 -= c_30 * c_02;
+
+	c_22 -= c_21 * c_12;
+	c_32 -= c_31 * c_12;
+
+	tmp = 1.0 / c_22;
+	c_32 *= tmp;
+
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+
+	// fourth column
+	c_13 -= c_10 * c_03;
+	c_23 -= c_20 * c_03;
+	c_33 -= c_30 * c_03;
+
+	c_23 -= c_21 * c_13;
+	c_33 -= c_31 * c_13;
+
+	c_33 -= c_32 * c_23;
+
+	tmp = 1.0 / c_33;
+
+	inv_diag_D[3] = tmp;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_1, e_2, e_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solution
+
+	if(km==1)
+		goto store;
+	
+	e_1 = E[1+bs*0];
+	e_2 = E[2+bs*0];
+	e_3 = E[3+bs*0];
+	c_10 -= e_1 * c_00;
+	c_20 -= e_2 * c_00;
+	c_30 -= e_3 * c_00;
+	c_11 -= e_1 * c_01;
+	c_21 -= e_2 * c_01;
+	c_31 -= e_3 * c_01;
+	c_12 -= e_1 * c_02;
+	c_22 -= e_2 * c_02;
+	c_32 -= e_3 * c_02;
+	c_13 -= e_1 * c_03;
+	c_23 -= e_2 * c_03;
+	c_33 -= e_3 * c_03;
+
+	if(km==2)
+		goto store;
+	
+	e_2 = E[2+bs*1];
+	e_3 = E[3+bs*1];
+	c_20 -= e_2 * c_10;
+	c_30 -= e_3 * c_10;
+	c_21 -= e_2 * c_11;
+	c_31 -= e_3 * c_11;
+	c_22 -= e_2 * c_12;
+	c_32 -= e_3 * c_12;
+	c_23 -= e_2 * c_13;
+	c_33 -= e_3 * c_13;
+
+	if(km==3)
+		goto store;
+	
+	e_3 = E[3+bs*2];
+	c_30 -= e_3 * c_20;
+	c_31 -= e_3 * c_21;
+	c_32 -= e_3 * c_22;
+	c_33 -= e_3 * c_23;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+	
+	// solve
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+	
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+	
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+	
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+//	printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+//	printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+//	printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+//	printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+	
+	// solve
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+	
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+	
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+	
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+