blob: 03975f4751ab180a2aaec1177cc28b0c0910167f [file] [log] [blame]
/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* HPMPC is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* HPMPC is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with HPMPC; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Computes the four values  alpha[0]*(A*x)[i] + beta[0]*y[i],  i = 0..3,
 * and stores a sub-range of them into z.
 *
 * A is read as 4 rows by kmax columns, element (i,j) at A[i+4*j]; x supplies
 * kmax values. y[0..3] are all read regardless of k0/k1. Result row i is
 * written to z[i] only when k0 <= i < k1; the other entries of z are left
 * untouched. Every input is read before the first store to z.
 */
void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
	{
	const int bs = 4;
	float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
	float res[4];
	int ii, kk;

	// accumulate A*x one column at a time
	for(kk=0; kk<kmax; kk++)
		{
		const float x_k = x[kk];
		const float *col = A + kk*bs;
		for(ii=0; ii<4; ii++)
			{
			acc[ii] += col[ii] * x_k;
			}
		}

	// scale and add the beta*y term
	for(ii=0; ii<4; ii++)
		{
		res[ii] = alpha[0]*acc[ii] + beta[0]*y[ii];
		}

	// masked store: only rows inside [k0, k1)
	for(ii=0; ii<4; ii++)
		{
		if(k0<=ii && k1>ii)
			{
			z[ii] = res[ii];
			}
		}
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Full-store variant: forwards to kernel_sgemv_n_4_gen_lib4 with the row
 * range [0, 4), so all four entries of z are written.
 */
void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
	{
	const int row_begin = 0;
	const int row_end = 4;

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, row_begin, row_end);
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Variable-size variant: forwards to kernel_sgemv_n_4_gen_lib4 with the row
 * range [0, k1), so only z[i] with i < k1 is written.
 */
void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
	{
	const int row_begin = 0;

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, row_begin, k1);
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Computes the four values  alpha[0]*(A^T*x)[j] + beta[0]*y[j],  j = 0..3,
 * and stores the leading ones into z.
 *
 * A is walked row by row over kmax rows; within a group of 4 rows element
 * (i,j) is read at A[i+4*j], and the pointer advances by sda*4 floats from
 * one group of 4 rows to the next. When offA is non-zero the first
 * min(4-offA, kmax) rows are consumed one at a time before the pointer jumps
 * to the next group.
 *
 * y[0..3] are all read. z[0] is always written; z[1], z[2], z[3] are written
 * only when km >= 2, km >= 3, km >= 4 respectively.
 */
void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
	{
	const int bs = 4;
	float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
	float res[4];
	int ii, jj, kk;

	kk = 0;

	// unaligned head: rows up to the end of the first group of 4
	if(offA!=0) // 1, 2, 3
		{
		int head = 4-offA;
		if(head>kmax)
			{
			head = kmax;
			}
		for(; kk<head; kk++)
			{
			const float x_k = x[0];
			for(jj=0; jj<4; jj++)
				{
				acc[jj] += A[bs*jj] * x_k;
				}
			A += 1;
			x += 1;
			}
		A += bs*(sda-1);
		}

	// complete groups of 4 rows
	for(; kk<kmax-bs+1; kk+=bs)
		{
		for(ii=0; ii<bs; ii++)
			{
			const float x_i = x[ii];
			for(jj=0; jj<4; jj++)
				{
				acc[jj] += A[ii+bs*jj] * x_i;
				}
			}
		A += sda*bs;
		x += 4;
		}

	// leftover rows
	for(; kk<kmax; kk++)
		{
		const float x_k = x[0];
		for(jj=0; jj<4; jj++)
			{
			acc[jj] += A[bs*jj] * x_k;
			}
		A += 1;
		x += 1;
		}

	for(jj=0; jj<4; jj++)
		{
		res[jj] = alpha[0]*acc[jj] + beta[0]*y[jj];
		}

	// z[0] is stored unconditionally, the rest depends on km
	z[0] = res[0];
	if(km>=2)
		{
		z[1] = res[1];
		}
	if(km>=3)
		{
		z[2] = res[2];
		}
	if(km>=4)
		{
		z[3] = res[3];
		}
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Aligned, full-store variant: forwards to kernel_sgemv_t_4_gen_lib4 with
 * offA = 0 and km = 4.
 */
void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
	{
	const int row_offset = 0;
	const int n_store = 4;

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, row_offset, A, sda, x, beta, y, z, n_store);
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Aligned, variable-size variant: forwards to kernel_sgemv_t_4_gen_lib4 with
 * offA = 0 and km = k1.
 */
void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
	{
	const int row_offset = 0;

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, row_offset, A, sda, x, beta, y, z, k1);
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Forward-substitution step on a block of 4 rows, with variable size.
 *
 * 1. Update:  r = y[0..3] - A_left * x, where A_left is read as 4 rows with
 *    element (i,j) at A[i+4*j]. Columns are consumed in groups of 4 while
 *    k < kmax-3, so only complete groups of 4 columns are applied.
 * 2. Solve with the 4x4 block that follows A_left in memory: its strictly
 *    lower entries are read from A, and the diagonal is applied by
 *    multiplying with inv_diag_A[j].
 *
 * kn is the number of columns of that block to eliminate (1..4) and km the
 * number of rows to produce. z[0..kn-1] receive the solved values. When
 * kn < 4 the function stops after kn columns and stores the updated, not yet
 * solved, rows kn..km-1 into z as well.
 *
 * All results go to z; y is only read. (The partial exits used to write the
 * residual rows into y, which left z incomplete and modified the input
 * whenever y and z were different buffers.)
 *
 * y[0..3] and A[1..3] are read regardless of km/kn.
 */
void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
	{
	const int bs = 4;
	int ii, jj, k;
	float y_0, y_1, y_2, y_3;
	float r[4] = {0.0f, 0.0f, 0.0f, 0.0f};

	// r = - A_left * x, four columns per iteration
	for(k=0; k<kmax-3; k+=4)
		{
		for(jj=0; jj<4; jj++)
			{
			const float x_j = x[jj];
			for(ii=0; ii<4; ii++)
				{
				r[ii] -= A[ii+bs*jj] * x_j;
				}
			}
		A += 4*bs;
		x += 4;
		}

	// A now points at the 4x4 block to solve with
	y_0 = y[0] + r[0];
	y_1 = y[1] + r[1];
	y_2 = y[2] + r[2];
	y_3 = y[3] + r[3];

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31,
		a_22, a_32,
		a_33;

	// column 0
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	y_0 *= a_00;
	z[0] = y_0;
	y_1 -= a_10 * y_0;
	y_2 -= a_20 * y_0;
	y_3 -= a_30 * y_0;

	if(kn==1)
		{
		// store the updated rows below the solved part
		if(km==1)
			return;
		z[1] = y_1;
		if(km==2)
			return;
		z[2] = y_2;
		if(km==3)
			return;
		z[3] = y_3;
		return;
		}

	// column 1
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	y_1 *= a_11;
	z[1] = y_1;
	y_2 -= a_21 * y_1;
	y_3 -= a_31 * y_1;

	if(kn==2)
		{
		if(km==2)
			return;
		z[2] = y_2;
		if(km==3)
			return;
		z[3] = y_3;
		return;
		}

	// column 2
	a_22 = inv_diag_A[2];
	a_32 = A[3+bs*2];
	y_2 *= a_22;
	z[2] = y_2;
	y_3 -= a_32 * y_2;

	if(kn==3)
		{
		if(km==3)
			return;
		z[3] = y_3;
		return;
		}

	// column 3
	a_33 = inv_diag_A[3];
	y_3 *= a_33;
	z[3] = y_3;
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Full-size variant: forwards to kernel_strsv_ln_inv_4_vs_lib4 with
 * km = 4 and kn = 4.
 */
void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{
	const int n_rows = 4;
	const int n_cols = 4;

	kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, n_rows, n_cols);
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Backward-substitution step on 4 columns, using the transpose of A.
 *
 * 1. Update:  r[j] = y[j] - sum_{k=4}^{kmax-1} A(k,j) * x[k],  j = 0..3.
 *    Rows are stored in groups of 4 with element (i,j) of a group at
 *    A[i+4*j]; consecutive groups are sda*4 floats apart. Rows 0..3 (the
 *    leading 4x4 block) and x[0..3] are not part of this update.
 * 2. Solve with the leading 4x4 block: only its strictly lower entries
 *    A[i+4*j], i > j, are read, in transposed fashion (z[3] first, z[0]
 *    last); the diagonal is applied by multiplying with inv_diag_A[j].
 *
 * Results are written to z[3], z[2], z[1], z[0] in that order; every read of
 * x and y happens before the first store.
 */
void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{
	const int bs = 4;
	const float *tri = A; // leading 4x4 block, used for the final solve
	float r[4] = {0.0f, 0.0f, 0.0f, 0.0f};
	int ii, jj, kk;

	// skip the leading block: move to the next group of 4 rows
	A += sda*bs;
	x += 4;

	// complete groups of 4 rows
	for(kk=4; kk<kmax-3; kk+=4)
		{
		for(ii=0; ii<4; ii++)
			{
			const float x_i = x[ii];
			for(jj=0; jj<4; jj++)
				{
				r[jj] -= A[ii+bs*jj] * x_i;
				}
			}
		A += sda*bs;
		x += 4;
		}

	// leftover rows inside the last group
	for(; kk<kmax; kk++)
		{
		const float x_k = x[0];
		for(jj=0; jj<4; jj++)
			{
			r[jj] -= A[bs*jj] * x_k;
			}
		A += 1;
		x += 1;
		}

	for(jj=0; jj<4; jj++)
		{
		r[jj] = y[jj] + r[jj];
		}

	// bottom triangle
	r[3] *= inv_diag_A[3];
	z[3] = r[3];
	r[2] -= tri[3+bs*2] * r[3];
	r[2] *= inv_diag_A[2];
	z[2] = r[2];

	// square
	r[0] -= tri[2+bs*0]*r[2] + tri[3+bs*0]*r[3];
	r[1] -= tri[2+bs*1]*r[2] + tri[3+bs*1]*r[3];

	// top triangle
	r[1] *= inv_diag_A[1];
	z[1] = r[1];
	r[0] -= tri[1+bs*0] * r[1];
	r[0] *= inv_diag_A[0];
	z[0] = r[0];
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Backward-substitution step on 3 columns, using the transpose of A.
 *
 * 1. Update:  r[j] = y[j] - sum_{k=3}^{kmax-1} A(k,j) * x[k],  j = 0..2.
 *    Rows are stored in groups of 4 with element (i,j) of a group at
 *    A[i+4*j]; consecutive groups are sda*4 floats apart. Rows 0..2 (the
 *    leading 3x3 block) and x[0..2] are not part of this update.
 * 2. Solve with the leading 3x3 block: only its strictly lower entries
 *    A[i+4*j], i > j, are read, in transposed fashion (z[2] first, z[0]
 *    last); the diagonal is applied by multiplying with inv_diag_A[j].
 *
 * Results are written to z[2], z[1], z[0] in that order; every read of x and
 * y happens before the first store.
 */
void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{
	const int bs = 4;
	int
		k;
	float *tA;
	tA = A; // leading block, needed again for the final solve
	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning: row 3 of the first group
		x_3 = x[3];
		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;
		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		// complete groups of 4 rows
		for(; k<kmax-3; k+=4)
			{
			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];
			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;
			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;
			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;
			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the 3 rows of the leading block; A and x must advance
		// together so that row 3 of A is paired with x[3]
		A += 3;
		x += 3;
		}

	// leftover rows inside the current group
	for(; k<kmax; k++)
		{
		x_0 = x[0];
		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		A += 1;
		x += 1;
		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;

	// bottom triangle
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;
	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Backward-substitution step on 2 columns, using the transpose of A.
 *
 * 1. Update:  r[j] = y[j] - sum_{k=2}^{kmax-1} A(k,j) * x[k],  j = 0..1.
 *    Rows are stored in groups of 4 with element (i,j) of a group at
 *    A[i+4*j]; consecutive groups are sda*4 floats apart. Rows 0..1 (the
 *    leading 2x2 block) and x[0..1] are not part of this update.
 * 2. Solve with the leading 2x2 block: only the entry A[1+4*0] below the
 *    diagonal is read; the diagonal is applied by multiplying with
 *    inv_diag_A[j].
 *
 * z[1] is written first, then z[0]; every read of x and y happens before the
 * first store.
 */
void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{
	const int bs = 4;
	const float *tri = A; // leading block, used for the final solve
	float r[2] = {0.0f, 0.0f};
	int ii, jj, kk;

	if(kmax>4)
		{
		// rows 2 and 3 of the first group
		for(ii=2; ii<4; ii++)
			{
			const float x_i = x[ii];
			for(jj=0; jj<2; jj++)
				{
				r[jj] -= A[ii+bs*jj] * x_i;
				}
			}
		A += sda*bs;
		x += 4;
		// complete groups of 4 rows
		for(kk=4; kk<kmax-3; kk+=4)
			{
			for(ii=0; ii<4; ii++)
				{
				const float x_i = x[ii];
				for(jj=0; jj<2; jj++)
					{
					r[jj] -= A[ii+bs*jj] * x_i;
					}
				}
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the 2 rows of the leading block
		A += 2;
		x += 2;
		kk = 2;
		}

	// leftover rows inside the current group
	for(; kk<kmax; kk++)
		{
		const float x_k = x[0];
		for(jj=0; jj<2; jj++)
			{
			r[jj] -= A[bs*jj] * x_k;
			}
		A += 1;
		x += 1;
		}

	r[0] = y[0] + r[0];
	r[1] = y[1] + r[1];

	// top triangle
	r[1] *= inv_diag_A[1];
	z[1] = r[1];
	r[0] -= tri[1+bs*0] * r[1];
	r[0] *= inv_diag_A[0];
	z[0] = r[0];
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Backward-substitution step on a single column, using the transpose of A.
 *
 * Computes  z[0] = ( y[0] - sum_{k=1}^{kmax-1} A(k,0) * x[k] ) * inv_diag_A[0].
 * Rows are stored in groups of 4 with row i of a group at A[i]; consecutive
 * groups are sda*4 floats apart. Row 0 and x[0] are not part of the sum.
 */
void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{
	const int bs = 4;
	float r_0 = 0.0f;
	int ii, kk;

	if(kmax>4)
		{
		// rows 1..3 of the first group
		for(ii=1; ii<4; ii++)
			{
			r_0 -= A[ii] * x[ii];
			}
		A += sda*bs;
		x += 4;
		// complete groups of 4 rows
		for(kk=4; kk<kmax-3; kk+=4)
			{
			for(ii=0; ii<4; ii++)
				{
				r_0 -= A[ii] * x[ii];
				}
			A += sda*bs;
			x += 4;
			}
		}
	else
		{
		// skip the single row of the leading block
		A += 1;
		x += 1;
		kk = 1;
		}

	// leftover rows inside the current group
	for(; kk<kmax; kk++)
		{
		r_0 -= A[0] * x[0];
		A += 1;
		x += 1;
		}

	r_0 = y[0] + r_0;

	// top triangle
	r_0 *= inv_diag_A[0];
	z[0] = r_0;
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Computes  z[0..3] = A * x  for 4 rows of A whose leading 4x4 block is
 * treated as upper triangular.
 *
 * A is read as 4 rows by kmax columns, element (i,j) at A[i+4*j]. In columns
 * 0..3 only entries with i <= j are read; columns 4..kmax-1 are used in full.
 * The leading block and x[0..3] are read regardless of kmax.
 */
void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
	{
	const int bs = 4;
	float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
	int ii, jj;

	// leading 4x4 block: on-or-above-diagonal entries only
	for(jj=0; jj<4; jj++)
		{
		const float x_j = x[jj];
		for(ii=0; ii<=jj; ii++)
			{
			acc[ii] += A[ii+bs*jj] * x_j;
			}
		}

	// remaining columns are dense
	for(jj=4; jj<kmax; jj++)
		{
		const float x_j = x[jj];
		const float *col = A + jj*bs;
		for(ii=0; ii<4; ii++)
			{
			acc[ii] += col[ii] * x_j;
			}
		}

	for(ii=0; ii<4; ii++)
		{
		z[ii] = acc[ii];
		}
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Computes  A^T * x  for 4 columns of A whose last group of 4 rows is treated
 * as upper triangular, and stores the leading entries of the result into z.
 *
 * Rows are stored in groups of 4 with element (i,j) of a group at A[i+4*j];
 * consecutive groups are sda*4 floats apart. Groups are used in full while
 * k < kmax-4; in the group that follows only entries with i <= j are read.
 *
 * z[0] is always written; z[1], z[2], z[3] are written only when km >= 2,
 * km >= 3, km >= 4 respectively.
 */
void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
	{
	const int bs = 4;
	float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
	int ii, jj, kk;

	// dense groups of 4 rows
	for(kk=0; kk<kmax-4; kk+=4)
		{
		for(ii=0; ii<4; ii++)
			{
			const float x_i = x[ii];
			for(jj=0; jj<4; jj++)
				{
				acc[jj] += A[ii+bs*jj] * x_i;
				}
			}
		A += sda*bs;
		x += 4;
		}

	// final group: row ii only contributes to columns jj >= ii
	for(ii=0; ii<4; ii++)
		{
		const float x_i = x[ii];
		for(jj=ii; jj<4; jj++)
			{
			acc[jj] += A[ii+bs*jj] * x_i;
			}
		}

	// z[0] is stored unconditionally, the rest depends on km
	z[0] = acc[0];
	if(km>=2)
		{
		z[1] = acc[1];
		}
	if(km>=3)
		{
		z[2] = acc[2];
		}
	if(km>=4)
		{
		z[3] = acc[3];
		}
	}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Full-store variant: forwards to kernel_strmv_ut_4_vs_lib4 with km = 4, so
 * all four entries of z are written.
 */
void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
	{
	const int n_store = 4;

	kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, n_store);
	}
#endif