Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX this kernel only accumulates into z_n: the caller must copy and scale y_n into z_n before calling it
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
+ {
+ // one pass over kmax rows of a 4-column strip of A: z_n[i] += sum_j A(i,j)*alpha_n*x_n[j] (in place) and z_t[j] = alpha_t*sum_i A(i,j)*x_t[i] + beta_t*y_t[j]; nothing is done for kmax<=0
+ if(kmax<=0)
+ return;
+ // bs is the stride between consecutive columns of A: element (i,j) of the current 4-row group is A[i+bs*j]
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+ // x_n_j holds alpha_n*x_n[j]; entries with j>=km stay zero, entry 0 is always loaded (so km<=1 behaves like km==1)
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha_n[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha_n[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha_n[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha_n[0]*x_n[3];
+ }
+ }
+ }
+ // y_t_j accumulates the dot product of column j of A with x_t over all kmax rows
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ for(; k<kmax-3; k+=bs)
+ {
+ // main loop: 4 rows per iteration, each row read once and used for both the n update (z_n) and the t accumulators (y_t_j)
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ // step A by sda*bs elements (presumably the next 4-row panel; sda looks like the panel stride in columns) and move z_n / x_t on by 4 entries
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+ // clean-up loop: the remaining (fewer than 4) rows one at a time; A moves by a single element per row
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ // store t: z_t[j] = alpha_t*y_t_j + beta_t*y_t[j] for j<km; entry 0 is written unconditionally
+ z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+ if(km>1)
+ {
+ z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+ if(km>2)
+ {
+ z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+ if(km>3)
+ {
+ z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX this kernel only accumulates into z_n: the caller must copy and scale y_n into z_n before calling it
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
+ {
+ // fixed-size entry point: forwards every argument to the variable-size kernel with km set to 4 (all four columns active)
+ kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX this kernel only accumulates into z_n: the caller must copy and scale y_n into z_n before calling it
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
+ {
+ // symmetric (lower-stored) mat-vec on a 4-column strip of kmax rows: the first 4 rows are read as a lower triangle, later rows in full; every element read updates z_n both through its row (n part) and its column (t part)
+ if(kmax<=0)
+ return;
+ // the t part reads the same input and accumulates into the same output as the n part; z_t keeps pointing at the first 4 outputs while z_n advances
+ float *x_t = x_n;
+ float *z_t = z_n;
+ // bs is the stride between consecutive columns of A: element (i,j) relative to the current A pointer is A[i+bs*j]
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+ // x_n_j holds alpha*x_n[j]; entries with j>=km stay zero, entry 0 is always loaded
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha[0]*x_n[3];
+ }
+ }
+ }
+ // y_t_j accumulates the (unscaled) dot product of column j of A with x_t; alpha is applied at store_t
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+ // k counts rows already consumed; each offA branch below handles the triangular corner and leaves A where the 4-row loop can continue
+ k = 0;
+ if(offA==0)
+ {
+ if(kmax<4)
+ {
+ // 0
+ // diagonal entries feed only the t accumulators; strictly-lower entries feed both the n update and the t accumulators
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+ goto store_t;
+ }
+ else
+ {
+ // kmax>=4: the whole 4x4 lower triangle is taken from the current 4 rows, then A steps by sda*bs
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ k += 4;
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ }
+ else if(offA==1)
+ {
+ // the panel jump comes after 3 rows here (A presumably starts at row 1 of its panel); up to 7 rows are peeled, ending right after the second panel jump
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==6)
+ goto store_t;
+
+ // 6
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==7)
+ goto store_t;
+
+ k += 7;
+
+ }
+ else if(offA==2)
+ {
+ // the panel jump comes after 2 rows here (A presumably starts at row 2 of its panel); up to 6 rows are peeled, ending right after the second panel jump
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==6)
+ goto store_t;
+
+ k += 6;
+
+ }
+ else // if(offA==3)
+ {
+ // the panel jump comes after 1 row here (A presumably starts at row 3 of its panel); up to 5 rows are peeled, ending right after the second panel jump
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==5)
+ goto store_t;
+
+ k += 5;
+
+ }
+ for(; k<kmax-3; k+=bs)
+ {
+ // main loop below the triangular corner: 4 full rows per iteration, then A steps by sda*bs
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+ // clean-up loop: the remaining (fewer than 4) rows one at a time; A moves by a single element per row
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+ // t part: add alpha*y_t_j onto the first km outputs (z_t is the original z_n); entry 0 is updated unconditionally
+ store_t:
+ z_t[0] += alpha[0]*y_t_0;
+ if(km>1)
+ {
+ z_t[1] += alpha[0]*y_t_1;
+ if(km>2)
+ {
+ z_t[2] += alpha[0]*y_t_2;
+ if(km>3)
+ {
+ z_t[3] += alpha[0]*y_t_3;
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX this kernel only accumulates into z_n: the caller must copy and scale y_n into z_n before calling it
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
+ {
+ // fixed-size entry point: forwards to the general kernel with offA set to 0 and km set to 4
+ kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+
+