Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Computes one 4-row panel of a single-precision gemv (non-transposed):
//   z[i] = alpha*(A*x)[i] + beta*y[i]   for i in [k0, k1) (clipped to [0,4))
// A is stored panel-major with block size 4: element (i,j) lives at
// A[i+bs*j]. kmax is the number of columns of A. The [k0, k1) row mask is
// what makes this the generic ("gen") variant, used for edge panels.
void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
	{

	const int bs = 4;

	int k;

	float
		x_0,
		y_0=0, y_1=0, y_2=0, y_3=0;

	// main loop: accumulate 4 columns per iteration
	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		x_0 = x[1];

		y_0 += A[0+bs*1] * x_0;
		y_1 += A[1+bs*1] * x_0;
		y_2 += A[2+bs*1] * x_0;
		y_3 += A[3+bs*1] * x_0;

		x_0 = x[2];

		y_0 += A[0+bs*2] * x_0;
		y_1 += A[1+bs*2] * x_0;
		y_2 += A[2+bs*2] * x_0;
		y_3 += A[3+bs*2] * x_0;

		x_0 = x[3];

		y_0 += A[0+bs*3] * x_0;
		y_1 += A[1+bs*3] * x_0;
		y_2 += A[2+bs*3] * x_0;
		y_3 += A[3+bs*3] * x_0;

		A += 4*bs;
		x += 4;

		}

	// clean-up loop: remaining columns one at a time
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		A += 1*bs;
		x += 1;

		}

	// alpha/beta scaling against the input vector y
	y_0 = alpha[0]*y_0 + beta[0]*y[0];
	y_1 = alpha[0]*y_1 + beta[0]*y[1];
	y_2 = alpha[0]*y_2 + beta[0]*y[2];
	y_3 = alpha[0]*y_3 + beta[0]*y[3];

	// masked store; logical && replaces the original bitwise & (identical
	// result on these 0/1 comparison operands, but idiomatic for booleans)
	if(k0<=0 && k1>3)
		{
		// fast path: all 4 rows active
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		if(k0<=0 && k1>0) z[0] = y_0;
		if(k0<=1 && k1>1) z[1] = y_1;
		if(k0<=2 && k1>2) z[2] = y_2;
		if(k0<=3 && k1>3) z[3] = y_3;
		}

	}
+#endif
+	
+	
+	
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// z[0:4] = alpha*A*x + beta*y[0:4]: full-panel wrapper around the generic
// kernel, with the whole row range [0,4) active (no masking).
void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
	{

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size wrapper: stores only the first k1 of the 4 panel rows
// (rows [0, k1)), delegating to the generic masked kernel.
void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
	{

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);

	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Transposed gemv over a 4-column slice of a panel-major matrix:
//   z[j] = alpha*(A^T*x)[j] + beta*y[j]   for j in 0..3 (masked by km)
// A is panel-major with block size 4 (element (i,j) at A[i+bs*j], row
// panels separated by sda*bs floats). offA is the row offset inside the
// first panel; km limits how many of the 4 results are stored.
void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
	{

	const int bs = 4;

	int ii, jj;

	float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f, acc3 = 0.0f;
	float xi;

	ii = 0;

	// head: consume the unaligned rows of the first panel one by one
	if(offA!=0)
		{
		int head = 4 - offA;
		if(head > kmax)
			head = kmax;
		for(; ii < head; ii++)
			{
			xi = x[0];
			acc0 += A[0+bs*0] * xi;
			acc1 += A[0+bs*1] * xi;
			acc2 += A[0+bs*2] * xi;
			acc3 += A[0+bs*3] * xi;
			A += 1;
			x += 1;
			}
		// jump to the start of the next row panel
		A += bs*(sda-1);
		}

	// body: full panels of 4 rows at a time
	for(; ii < kmax-3; ii += 4)
		{
		for(jj = 0; jj < 4; jj++)
			{
			xi = x[jj];
			acc0 += A[jj+bs*0] * xi;
			acc1 += A[jj+bs*1] * xi;
			acc2 += A[jj+bs*2] * xi;
			acc3 += A[jj+bs*3] * xi;
			}
		A += sda*bs;
		x += 4;
		}

	// tail: leftover rows
	for(; ii < kmax; ii++)
		{
		xi = x[0];
		acc0 += A[0+bs*0] * xi;
		acc1 += A[0+bs*1] * xi;
		acc2 += A[0+bs*2] * xi;
		acc3 += A[0+bs*3] * xi;
		A += 1;
		x += 1;
		}

	// alpha/beta scaling against the input vector y
	acc0 = alpha[0]*acc0 + beta[0]*y[0];
	acc1 = alpha[0]*acc1 + beta[0]*y[1];
	acc2 = alpha[0]*acc2 + beta[0]*y[2];
	acc3 = alpha[0]*acc3 + beta[0]*y[3];

	// masked store of the first km results
	if(km>=4)
		{
		z[0] = acc0;
		z[1] = acc1;
		z[2] = acc2;
		z[3] = acc3;
		}
	else
		{
		z[0] = acc0;
		if(km>=2)
			{
			z[1] = acc1;
			if(km>2)
				{
				z[2] = acc2;
				}
			}
		}

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// z[0:4] = alpha*A^T*x + beta*y[0:4]: aligned full-store wrapper around the
// generic transposed kernel (offset 0, all 4 results stored).
void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
	{

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);

	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Variable-size wrapper for the transposed kernel: stores only the first k1
// of the 4 results (aligned panels, offset 0).
void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
	{

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);

	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Forward-substitution kernel for a lower-triangular solve with inverted
// diagonal (panel-major, element (i,j) at A[i+bs*j]).
// Step 1: accumulate y_i -= (A[:,0:kmax] * x)_i over the kmax columns of
//         already-solved unknowns. There is no clean-up loop, so this
//         assumes kmax is a multiple of 4 — TODO confirm at call sites.
// Step 2: add the right-hand side y, then forward-substitute through the
//         4x4 diagonal block (located at A after the loop) using
//         inv_diag_A, the reciprocals of the diagonal entries.
// km = number of valid rows; kn = number of unknowns actually solved.
void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
	{

	const int bs = 4;
	
	int k;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;
	
	// gemv part: subtract the contribution of the already-solved unknowns
	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[1+bs*0] * x_0;
		y_2 -= A[2+bs*0] * x_0;
		y_3 -= A[3+bs*0] * x_0;

		y_0 -= A[0+bs*1] * x_1;
		y_1 -= A[1+bs*1] * x_1;
		y_2 -= A[2+bs*1] * x_1;
		y_3 -= A[3+bs*1] * x_1;

		y_0 -= A[0+bs*2] * x_2;
		y_1 -= A[1+bs*2] * x_2;
		y_2 -= A[2+bs*2] * x_2;
		y_3 -= A[3+bs*2] * x_2;

		y_0 -= A[0+bs*3] * x_3;
		y_1 -= A[1+bs*3] * x_3;
		y_2 -= A[2+bs*3] * x_3;
		y_3 -= A[3+bs*3] * x_3;
		
		A += 4*bs;
		x += 4;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;
	y_3 = y[3] + y_3;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;
	
	// column 0: solve unknown 0, eliminate it from rows 1..3
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	y_0 *= a_00;
	z[0] = y_0;
	y_1 -= a_10 * y_0;
	y_2 -= a_20 * y_0;
	y_3 -= a_30 * y_0;

	if(kn==1)
		{
		if(km==1)
			return;
		// NOTE(review): the remaining residuals are written back to y, not z,
		// unlike z[0] above — behaviorally equivalent only if callers pass
		// aliasing y and z pointers; verify at call sites.
		y[1] = y_1;
		if(km==2)
			return;
		y[2] = y_2;
		if(km==3)
			return;
		y[3] = y_3;
		return;
		}

	// column 1: solve unknown 1, eliminate it from rows 2..3
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	y_1 *= a_11;	
	z[1] = y_1;
	y_2 -= a_21 * y_1;
	y_3 -= a_31 * y_1;

	if(kn==2)
		{
		if(km==2)
			return;
		// NOTE(review): residuals stored to y, not z — see note above.
		y[2] = y_2;
		if(km==3)
			return;
		y[3] = y_3;
		return;
		}

	// column 2: solve unknown 2, eliminate it from row 3
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	y_2 *= a_00;
	z[2] = y_2;
	y_3 -= a_10 * y_2;

	if(kn==3)
		{
		if(km==3)
			return;
		// NOTE(review): residual stored to y, not z — see note above.
		y[3] = y_3;

		return;
		}

	// column 3: solve the last unknown
	a_11 = inv_diag_A[3];
	y_3 *= a_11;	
	z[3] = y_3;

	}
+#endif
+	
+
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full 4x4 forward-substitution solve: delegates to the variable-size
// kernel with all rows and columns active (km = kn = 4).
void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{

	kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);


	}
+#endif
+	
+	
+		
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel solving a 4-unknown transposed-lower system
// with inverted diagonal:
//   L^T[0:4,0:4] * z = y - L^T[0:4,4:kmax] * x[4:kmax]
// A points to the panel holding L (panel-major, element (i,j) at A[i+bs*j],
// row panels separated by sda*bs floats); inv_diag_A holds the reciprocals
// of the diagonal entries; x carries the already-solved unknowns from row 4.
void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii, jj;

	float *diagA = A; // remember the 4x4 diagonal block for the solve phase

	float v0 = 0.0f, v1 = 0.0f, v2 = 0.0f, v3 = 0.0f;
	float xi;

	// skip past the diagonal block to the rows below it
	A += 4 + (sda-1)*bs;
	x += 4;

	// gemv part: subtract the contribution of the already-solved unknowns
	ii = 4;
	for(; ii < kmax-3; ii += 4)
		{
		for(jj = 0; jj < 4; jj++)
			{
			xi = x[jj];
			v0 -= A[jj+bs*0] * xi;
			v1 -= A[jj+bs*1] * xi;
			v2 -= A[jj+bs*2] * xi;
			v3 -= A[jj+bs*3] * xi;
			}
		A += sda*bs;
		x += 4;
		}
	for(; ii < kmax; ii++)
		{
		xi = x[0];
		v0 -= A[0+bs*0] * xi;
		v1 -= A[0+bs*1] * xi;
		v2 -= A[0+bs*2] * xi;
		v3 -= A[0+bs*3] * xi;
		A += 1;
		x += 1;
		}

	// add the right-hand side
	v0 += y[0];
	v1 += y[1];
	v2 += y[2];
	v3 += y[3];

	A = diagA;

	// bottom of the triangle
	v3 *= inv_diag_A[3];
	z[3] = v3;

	v2 -= A[3+bs*2] * v3;
	v2 *= inv_diag_A[2];
	z[2] = v2;

	// square update of the top 2x2 with the solved bottom pair
	v0 -= A[2+bs*0]*v2 + A[3+bs*0]*v3;
	v1 -= A[2+bs*1]*v2 + A[3+bs*1]*v3;

	// top of the triangle
	v1 *= inv_diag_A[1];
	z[1] = v1;

	v0 -= A[1+bs*0] * v1;
	v0 *= inv_diag_A[0];
	z[0] = v0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel solving a 3-unknown transposed-lower system
// with inverted diagonal:
//   L^T[0:3,0:3] * z = y - L^T[0:3,3:kmax] * x[3:kmax]
// A points to the panel holding L (panel-major, element (i,j) at A[i+bs*j],
// row panels separated by sda*bs floats); inv_diag_A holds the reciprocals
// of the diagonal entries; x carries the already-solved unknowns from row 3.
void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;
	
	int
		k;
	
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;
	
	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the first panel
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		// full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			
			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];
			
			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;
			
			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;
			
			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		A += 3;
		// bug fix: was "x += 1"; x must skip the 3 unsolved entries so the
		// clean-up loop below reads x[3] first, matching A's row offset and
		// the analogous advances in the _2 (x += 2) and _1 (x += 1) variants
		x += 3;
		}
	// leftover rows one at a time
	for(; k<kmax; k++)
		{
		
		x_0 = x[0];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		
		A += 1;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;
	x = tx;

	// bottom of the triangle
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square update with the solved unknown 2
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;
		
	// top of the triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel solving a 2-unknown transposed-lower system
// with inverted diagonal:
//   L^T[0:2,0:2] * z = y - L^T[0:2,2:kmax] * x[2:kmax]
// A points to the panel holding L (panel-major, element (i,j) at A[i+bs*j],
// row panels separated by sda*bs floats); inv_diag_A holds the reciprocals
// of the diagonal entries; x carries the already-solved unknowns from row 2.
void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;
	
	int
		k;
	
	// save the diagonal block / rhs start for the solve phase
	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0;
	
	k = 2;
	if(kmax>4)
		{
		// clean up at the beginning: rows 2 and 3 of the first panel
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[2+bs*0] * x_2;
		y_1 -= A[2+bs*1] * x_2;

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;

		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		// full panels of 4 rows
		for(; k<kmax-3; k+=4)
			{
			
			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];
			
			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			
			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			
			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		// advance past the 2 unsolved rows so the loop below starts at row 2
		A += 2;
		x += 2;
		}
	// leftover rows one at a time
	for(; k<kmax; k++)
		{
		
		x_0 = x[0];
		
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		
		A += 1;
		x += 1;

		}

	// add the right-hand side
	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;

	A = tA;
	x = tx;

	// 2x2 back-substitution on the top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Backward-substitution kernel solving a single unknown of a transposed-
// lower system with inverted diagonal:
//   z[0] = inv_diag_A[0] * (y[0] - L^T[0,1:kmax] * x[1:kmax])
// A points to the panel holding L (panel-major, element (i,j) at A[i+bs*j],
// row panels separated by sda*bs floats); x carries the already-solved
// unknowns from row 1 onwards.
void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int ii;

	float v0 = 0.0f;

	ii = 1;
	if(kmax>4)
		{
		// finish the first panel below the 1x1 diagonal block
		v0 -= A[1+bs*0] * x[1];
		v0 -= A[2+bs*0] * x[2];
		v0 -= A[3+bs*0] * x[3];

		A += 4 + (sda-1)*bs;
		x += 4;
		ii = 4;
		// full panels of 4 rows
		for(; ii < kmax-3; ii += 4)
			{
			v0 -= A[0+bs*0] * x[0];
			v0 -= A[1+bs*0] * x[1];
			v0 -= A[2+bs*0] * x[2];
			v0 -= A[3+bs*0] * x[3];
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// advance past the single unsolved row
		A += 1;
		x += 1;
		}
	// leftover rows one at a time
	for(; ii < kmax; ii++)
		{
		v0 -= A[0+bs*0] * x[0];
		A += 1;
		x += 1;
		}

	// add the right-hand side and scale by the inverted diagonal entry
	v0 += y[0];
	v0 *= inv_diag_A[0];
	z[0] = v0;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Upper-triangular (non-transposed) matrix-vector product over one 4-row
// panel: z[0:4] = U[0:4,0:kmax] * x, where U is upper triangular and stored
// panel-major with block size 4 (element (i,j) at A[i+bs*j]).
void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
	{

	const int bs = 4;

	int ii, jj;

	float t0 = 0.0f, t1 = 0.0f, t2 = 0.0f, t3 = 0.0f;
	float xi;

	// triangular 4x4 head: only entries on/above the diagonal contribute
	t0 += A[0+bs*0] * x[0];

	t0 += A[0+bs*1] * x[1];
	t1 += A[1+bs*1] * x[1];

	t0 += A[0+bs*2] * x[2];
	t1 += A[1+bs*2] * x[2];
	t2 += A[2+bs*2] * x[2];

	t0 += A[0+bs*3] * x[3];
	t1 += A[1+bs*3] * x[3];
	t2 += A[2+bs*3] * x[3];
	t3 += A[3+bs*3] * x[3];

	A += 4*bs;
	x += 4;

	// dense remainder of the panel: 4 columns per iteration
	for(ii = 4; ii < kmax-3; ii += 4)
		{
		for(jj = 0; jj < 4; jj++)
			{
			xi = x[jj];
			t0 += A[0+bs*jj] * xi;
			t1 += A[1+bs*jj] * xi;
			t2 += A[2+bs*jj] * xi;
			t3 += A[3+bs*jj] * xi;
			}
		A += 4*bs;
		x += 4;
		}

	// leftover columns one at a time
	for(; ii < kmax; ii++)
		{
		xi = x[0];
		t0 += A[0+bs*0] * xi;
		t1 += A[1+bs*0] * xi;
		t2 += A[2+bs*0] * xi;
		t3 += A[3+bs*0] * xi;
		A += 1*bs;
		x += 1;
		}

	z[0] = t0;
	z[1] = t1;
	z[2] = t2;
	z[3] = t3;

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Transposed upper-triangular matrix-vector product over a 4-column slice:
// z[0:km] = (U^T * x)[0:4] masked to the first km entries. U is stored
// panel-major with block size 4 (element (i,j) at A[i+bs*j], row panels
// separated by sda*bs floats); the last 4 rows form the triangular tail.
void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
	{

	const int bs  = 4;
	
	int
		k;
	
	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;
	
	// dense panels above the triangular tail
	k=0;
	for(; k<kmax-4; k+=4)
		{
		
		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];
		
		y_0 += A[0+bs*0] * x_0;
		y_1 += A[0+bs*1] * x_0;
		y_2 += A[0+bs*2] * x_0;
		y_3 += A[0+bs*3] * x_0;

		y_0 += A[1+bs*0] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[1+bs*2] * x_1;
		y_3 += A[1+bs*3] * x_1;
		
		y_0 += A[2+bs*0] * x_2;
		y_1 += A[2+bs*1] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[2+bs*3] * x_2;

		y_0 += A[3+bs*0] * x_3;
		y_1 += A[3+bs*1] * x_3;
		y_2 += A[3+bs*2] * x_3;
		y_3 += A[3+bs*3] * x_3;
		
		A += sda*bs;
		x += 4;

		}

	// triangular 4x4 tail: only entries on/above the diagonal of U
	// (i.e. on/below it in the transposed access pattern) contribute
	x_0 = x[0];
	x_1 = x[1];
	x_2 = x[2];
	x_3 = x[3];
	
	y_0 += A[0+bs*0] * x_0;
	y_1 += A[0+bs*1] * x_0;
	y_2 += A[0+bs*2] * x_0;
	y_3 += A[0+bs*3] * x_0;

/*	y_0 += A[1+bs*0] * x_1;*/
	y_1 += A[1+bs*1] * x_1;
	y_2 += A[1+bs*2] * x_1;
	y_3 += A[1+bs*3] * x_1;
	
/*	y_0 += A[2+bs*0] * x_2;*/
/*	y_1 += A[2+bs*1] * x_2;*/
	y_2 += A[2+bs*2] * x_2;
	y_3 += A[2+bs*3] * x_2;

/*	y_0 += A[3+bs*0] * x_3;*/
/*	y_1 += A[3+bs*1] * x_3;*/
/*	y_2 += A[3+bs*2] * x_3;*/
	y_3 += A[3+bs*3] * x_3;
	
//	A += sda*bs;
//	x += 4;

	// masked store of the first km results
	// (dead "store:" label removed: nothing jumped to it)
	if(km>=4)
		{
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		z[0] = y_0;
		if(km>=2)
			{
			z[1] = y_1;
			if(km>2)
				{
				z[2] = y_2;
				}
			}
		}

	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-store variant of the transposed upper-triangular matrix-vector
// product: delegates to the vs kernel with all 4 results stored.
void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
	{
	
	kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);

	}
+#endif
+
+
+
+
+
+